Data Pre-processing 

In [1]:
import pandas as pd

In [2]:
# Load the raw Netflix review export into a DataFrame.
df = pd.read_csv("netflix_reviews.csv")

In [3]:
# Size of the raw frame as (rows, columns).
df.shape

(136904, 8)

In [4]:
# Discard the metadata columns that are not used in the rest of the notebook.
df = df.drop(
    columns=[
        'reviewId',
        'userName',
        'thumbsUpCount',
        'reviewCreatedVersion',
        'at',
        'appVersion',
    ]
)

In [33]:
#Handling null values

In [5]:
# Column dtypes and non-null counts after the column drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136904 entries, 0 to 136903
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   content  136898 non-null  object
 1   score    136904 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.1+ MB


In [6]:
# Missing values per column.
df.isna().sum()

content    6
score      0
dtype: int64

In [7]:
# Keep only the rows that actually have review text.
df = df[df['content'].notna()]


In [8]:
# Re-check: every column should now report zero missing values.
df.isna().sum()

content    0
score      0
dtype: int64

In [38]:
#Handling duplicates

In [9]:
# Number of rows that exactly repeat an earlier row.
df.duplicated().sum()

np.int64(5418)

In [10]:
# Keep the first occurrence of each row and discard later repeats.
df = df[~df.duplicated()]


In [11]:
# Re-check: no duplicated rows should remain.
print(df.duplicated().sum())

0


In [42]:
#Handling wrong datatypes

In [12]:
# Distinct Python types found among the values of the review text column.
unique_content = df['content'].apply(type).unique()
print(unique_content)

[<class 'str'>]


In [13]:
# Distinct Python types found among the values of the score column.
unique_score = df['score'].apply(type).unique()
print(unique_score)

[<class 'int'>]


In [45]:
#Handling outliers

In [14]:
# Summary statistics of the numeric columns, used to eyeball the value range.
print(df.describe())

               score
count  131480.000000
mean        2.793033
std         1.711308
min         1.000000
25%         1.000000
50%         3.000000
75%         5.000000
max         5.000000


In [15]:
# Per numeric column: does any value fall below zero?
negative = df.select_dtypes(include='number').lt(0).any()
print(negative)

score    False
dtype: bool


In [16]:
# Number of reviews that are empty once surrounding whitespace is removed.
df['content'].str.strip().eq('').sum()

np.int64(0)

In [17]:
# Show the reviews that are shorter than three characters.
df.loc[df['content'].str.len() < 3]

Unnamed: 0,content,score
40,ok,5
107,😍,5
158,👍,5
279,👏👏,5
467,😜,5
...,...,...
27917,🤸,5
28543,🍂,5
28683,🤣e,5
28800,aq,2


In [18]:
# Keep only reviews that are at least three characters long.
df = df[df['content'].str.len().ge(3)]

In [19]:
# Preview the reviews that contain at least one non-ASCII character.
non_ascii = df[~df['content'].map(str.isascii)]
print(non_ascii.head())


                                              content  score
8                            send more games please 🙏      3
11  omg like wait I love Netflix uh its has what I...      5
20                                     আলহামদুলিল্লাহ      5
35                                             nice 👍      5
37  bad application for movies because all old mov...      1


In [20]:
# Keep only the rows whose review text is made up entirely of ASCII characters.
# NOTE(review): this drops the whole row, so otherwise-English reviews that
# contain an emoji are discarded as well - confirm that this is intended.
df = df[df['content'].map(str.isascii)]


In [21]:
import re

# Drop reviews whose stripped text consists solely of non-word characters
# (anything other than letters, digits and underscore).
df = df[~df['content'].str.strip().str.fullmatch(r'\W+')]

In [22]:
# Second pass without stripping first: also discards reviews that are nothing
# but whitespace and/or other non-word characters.
df = df.loc[~df['content'].str.match(r'^\W+$')]

In [23]:
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect's algorithm is randomised; fixing the seed before the first
# detection makes repeated runs classify borderline reviews the same way,
# so the English filter below keeps the same rows every time.
DetectorFactory.seed = 0

def safe_detect(text):
    """Return the language code langdetect assigns to ``text``.

    Args:
        text: Review text to classify.

    Returns:
        The language code returned by ``detect`` (e.g. ``'en'``), or
        ``'unknown'`` when ``detect`` raises ``LangDetectException``, so that
        applying this function over a whole column does not abort.
    """
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'

# Detect each review's language and keep only those classified as English.
df = df[df['content'].map(safe_detect) == 'en']


In [24]:
# Persist the filtered reviews (without the index) for the text-cleaning step.
df.to_csv("netflix_cleaned_reviews.csv", index=False)


In [25]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used below: the stop-word list and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')

# Shared helpers used by clean_text.
tokenizer = TreebankWordTokenizer()
stop_words = set(stopwords.words('english'))  # set for fast membership tests
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise a raw review into a space-separated string of lemmas.

    Steps: lowercase, remove every character that is not ``a-z`` or
    whitespace, collapse whitespace, tokenize, drop stop words, lemmatize,
    then drop any lemma that is itself a stop word or is shorter than three
    characters.

    Stop words are removed *before* lemmatization because the lemmatizer can
    turn a stop word into a different token (e.g. "does" -> "doe") that no
    longer matches the stop-word list and would leak into the output.

    Args:
        text: Review text as a ``str``.

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)          # Remove non-letter characters
    text = re.sub(r'\s+', ' ', text).strip()      # Collapse runs of whitespace
    tokens = tokenizer.tokenize(text)
    # First pass: match stop words on the surface form, as written.
    tokens = [w for w in tokens if w not in stop_words]
    lemmas = [lemmatizer.lemmatize(w) for w in tokens]
    # Second pass: drop lemmas that only became stop words after reduction,
    # plus very short leftovers.
    lemmas = [w for w in lemmas if w not in stop_words and len(w) > 2]
    return " ".join(lemmas)


# Reload the filtered reviews from disk.
df = pd.read_csv('netflix_cleaned_reviews.csv')

# Guard against rows without review text before cleaning.
df = df[df['content'].notna()]

# Normalise every review with clean_text.
df['content'] = df['content'].map(clean_text)

# Discard reviews that are empty after cleaning.
df = df[df['content'].str.strip().ne('')]

print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/avishka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/avishka/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                             content  score
0                             good better experience      5
1  experience good helpful watch movie drama easi...      5
2  give option adjusting speed talk fast also dat...      4
3  service month time movie place mine didnt like...      1
4  working fine get proper sync phone continuatio...      4


In [None]:
# Write the cleaned reviews as tab-separated text, without header or index.
df.to_csv("clean_reviews.txt", sep="\t", header=False, index=False)
