In [78]:
import pandas as pd

In [79]:
df = pd.read_csv("netflix_reviews.csv")

Here is a preview of the dataset:

In [80]:
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,cc1cfcd2-dc8a-4ead-88d1-7f2b2dbb2662,NR Bharadwaj,Plsssss stoppppp giving screen limit like when...,2,0,8.120.0 build 10 50712,2024-07-02 17:17:53,8.120.0 build 10 50712
1,7dfb1f90-f185-4e81-a97f-d38f0128e5a4,Maxwell Ntloko,Good,5,1,,2024-06-26 15:38:06,
2,3009acc4-8554-41cf-88de-cc5e2f6e45b2,Dilhani Mahanama,👍👍,5,0,,2024-06-24 15:29:54,
3,b3d27852-9a3b-4f74-9e16-15434d3ee324,Karen Gulli,Good,3,0,,2024-06-22 15:41:54,
4,8be10073-2368-4677-b828-9ff5d06ea0b7,Ronny Magadi,"App is useful to certain phone brand ,,,,it is...",1,0,8.105.0 build 15 50626,2024-06-22 05:16:03,8.105.0 build 15 50626


There are many columns we don't need. So we can just remove them.

In [81]:
df = df[['reviewId','content', 'score']]

Here is the dataset with the dropped columns:

In [82]:
df.head()

Unnamed: 0,reviewId,content,score
0,cc1cfcd2-dc8a-4ead-88d1-7f2b2dbb2662,Plsssss stoppppp giving screen limit like when...,2
1,7dfb1f90-f185-4e81-a97f-d38f0128e5a4,Good,5
2,3009acc4-8554-41cf-88de-cc5e2f6e45b2,👍👍,5
3,b3d27852-9a3b-4f74-9e16-15434d3ee324,Good,3
4,8be10073-2368-4677-b828-9ff5d06ea0b7,"App is useful to certain phone brand ,,,,it is...",1


Let's check now if we have any duplicate rows or any missing values.

In [83]:
def show_details(dataset):
    missed_values = dataset.isnull().sum()
    missed_values_percent = (dataset.isnull().sum()) / len(dataset)
    duplicated_values = dataset.duplicated().sum()
    duplicated_values_percent = (dataset.duplicated().sum()) / len(dataset)
    info_frame = pd.DataFrame({'Missing_Values' : missed_values , 
                              'Missing_Values %' :missed_values_percent,
                              'Duplicated values' :duplicated_values,
                              'Duplicated values %':duplicated_values_percent})
    return info_frame.T

In [84]:
show_details(df)

Unnamed: 0,reviewId,content,score
Missing_Values,0.0,2.0,0.0
Missing_Values %,0.0,1.8e-05,0.0
Duplicated values,316.0,316.0,316.0
Duplicated values %,0.002781,0.002781,0.002781


Since the missing values and duplicates are just a tiny percentage, we decided to simply drop them.

In [85]:
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)

You can see now that we don't have any missing or duplicate value:

In [86]:
show_details(df)

Unnamed: 0,reviewId,content,score
Missing_Values,0.0,0.0,0.0
Missing_Values %,0.0,0.0,0.0
Duplicated values,0.0,0.0,0.0
Duplicated values %,0.0,0.0,0.0


At this point we can begin adding a new column in this dataset...

In [87]:
def determine_sentiment(score):
    if score > 3:
        return 'positive'
    elif score < 3:
        return 'negative'
    else:
        return 'neutral'

We generate the new column by applying the function written above.

In [88]:
df['sentiment'] = df['score'].apply(determine_sentiment)


At this point, we can remove the "ReviewId" column. We used it only for removing the duplicate rows.

And lastly, we can rewrite the column names for clarity.

In [89]:
df = df[['content', 'score', 'sentiment']]
df.columns = ['Content', 'Score', 'Sentiment']

df.head()

Unnamed: 0,Content,Score,Sentiment
0,Plsssss stoppppp giving screen limit like when...,2,negative
1,Good,5,positive
2,👍👍,5,positive
3,Good,3,neutral
4,"App is useful to certain phone brand ,,,,it is...",1,negative


We can save the cleaned up dataset now.

In [90]:
df.to_csv('cleaned_data.csv', index=False)