In [21]:
import pandas as pd
import numpy as np
#reading the csv file
#the data is encoded in latin encoding style hence the specification (encoding = 'latin1')
df3 = pd.read_csv('rt.reviews.tsv.gz', encoding = 'latin1', delimiter='\t')
df3.head()

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"
2,3,... life lived in a bubble in financial dealin...,,fresh,Sean Axmaker,0,Stream on Demand,"January 4, 2018"
3,3,Continuing along a line introduced in last yea...,,fresh,Daniel Kasman,0,MUBI,"November 16, 2017"
4,3,... a perverse twist on neorealism...,,fresh,,0,Cinema Scope,"October 12, 2017"


In [2]:
df3.info()

# The data reviews has 54432 rows and 8 columns
# It also contains two data types object(6) and int(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          54432 non-null  int64 
 1   review      48869 non-null  object
 2   rating      40915 non-null  object
 3   fresh       54432 non-null  object
 4   critic      51710 non-null  object
 5   top_critic  54432 non-null  int64 
 6   publisher   54123 non-null  object
 7   date        54432 non-null  object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


In [3]:
# The date is in data type object; the code below transforms it to datetime
df3['date'] = pd.to_datetime(df3['date'])

In [4]:
# Droping the column review
df3.drop(columns = 'review', axis=1, inplace=True)

In [5]:
# tranforming rating from object data type to numeric
df3['rating'] = pd.to_numeric(df3['rating'], errors='coerce')

In [6]:
#check if rating transformation was successful
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          54432 non-null  int64         
 1   rating      750 non-null    float64       
 2   fresh       54432 non-null  object        
 3   critic      51710 non-null  object        
 4   top_critic  54432 non-null  int64         
 5   publisher   54123 non-null  object        
 6   date        54432 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 2.9+ MB


In [7]:
#check the missing values
df3.isnull().sum()

id                0
rating        53682
fresh             0
critic         2722
top_critic        0
publisher       309
date              0
dtype: int64

In [8]:
# publisher has few number of missing values and therefore a decision to drop them
df3.dropna(subset=['publisher'], inplace=True)

In [9]:
df3['critic'] = df3['critic'].str.strip().str.lower()

In [10]:
df3['critic'].fillna('no critic', inplace=True)

In [11]:
df3.isnull().sum()

id                0
rating        53376
fresh             0
critic            0
top_critic        0
publisher         0
date              0
dtype: int64

In [12]:
# Forward fill the 'ratings' column with the previous non-null value
df3['rating'] = df3['rating'].fillna(method='ffill')

In [13]:
df3.isnull().sum()

id             0
rating        22
fresh          0
critic         0
top_critic     0
publisher      0
date           0
dtype: int64

In [14]:
#droping the remaining 22 rows
df3.dropna(subset=['rating'], inplace=True)

In [15]:
# check if the 22 rows were droped
df3.isnull().sum()

id            0
rating        0
fresh         0
critic        0
top_critic    0
publisher     0
date          0
dtype: int64

In [16]:
# reseting the index from so that it can begin from 0
df3.reset_index(drop=True, inplace=True)

In [17]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54101 entries, 0 to 54100
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   id          54101 non-null  int64         
 1   rating      54101 non-null  float64       
 2   fresh       54101 non-null  object        
 3   critic      54101 non-null  object        
 4   top_critic  54101 non-null  int64         
 5   publisher   54101 non-null  object        
 6   date        54101 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(2), object(3)
memory usage: 2.9+ MB


In [18]:
#saving the clean data
df3.to_csv('clean_reviews.csv', index=False)

In [19]:
clean_data = pd.read_csv('clean_reviews.csv')
clean_data.head()

Unnamed: 0,id,rating,fresh,critic,top_critic,publisher,date
0,3,8.0,fresh,philip martin,0,Arkansas Democrat-Gazette,2012-09-07
1,3,8.0,rotten,mike scott,0,Times-Picayune,2012-09-07
2,3,8.0,rotten,joe williams,1,St. Louis Post-Dispatch,2012-09-06
3,3,8.0,rotten,justin craig,0,FoxNews.com,2012-09-05
4,3,8.0,fresh,roger moore,0,Tribune News Service,2012-09-04


In [20]:
clean_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54101 entries, 0 to 54100
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          54101 non-null  int64  
 1   rating      54101 non-null  float64
 2   fresh       54101 non-null  object 
 3   critic      54101 non-null  object 
 4   top_critic  54101 non-null  int64  
 5   publisher   54101 non-null  object 
 6   date        54101 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 2.9+ MB
