In [1]:
import pandas as pd
import seaborn as sns

In [2]:
def makeAndCleanDF ():
    avengersEndgame = pd.read_csv('Avengers Endgame.csv')
    forrestGump = pd.read_csv('Forrest Gump.csv')
    johnWickChapterThree = pd.read_csv('John Wick Chapter 3.csv')
    joker = pd.read_csv('Joker.csv')
    morbius = pd.read_csv('Morbius.csv')
    pulpFiction = pd.read_csv('Pulp Fiction.csv')
    spiderManNoWayHome = pd.read_csv('SpiderMan No Way Home.csv')
    theAvengers = pd.read_csv('The Avengers.csv')
    theDarkKnight = pd.read_csv('The Dark Knight.csv')
    thorRagnarok = pd.read_csv('Thor Ragnarok.csv')
    df = pd.concat([avengersEndgame, forrestGump, johnWickChapterThree, joker, morbius, pulpFiction, spiderManNoWayHome, theAvengers, theDarkKnight, thorRagnarok], ignore_index=True)
    df = df.rename(columns = {"User's Rating out of 10":"Rating", "Usefulness Vote": "Usefulness", "Total Votes": "Total"})
    df.dropna(inplace = True)
    df['Review'] = df['Review'].astype(str)
    df['Review Title'] = df['Review Title'].astype(str)
    df['Rating'] = df['Rating'].str.strip()
    df = df[df.Rating != "Was this review helpful?  Sign in to vote."]
    df['Rating'] = df['Rating'].astype(int)
    return df

In [3]:
df = makeAndCleanDF ()

In [4]:
def getDFTwo ():
    partOne = pd.read_json('part-01.json')
    partTwo = pd.read_json('part-02.json')
    partThree = pd.read_json('part-03.json')
    partFour = pd.read_json('part-04.json')
    partFive = pd.read_json('part-05.json')
    partSix = pd.read_json('part-06.json')
    sample = pd.read_json('sample.json')
    
    df2 = pd.concat([partOne, partTwo, partThree, partFour, partFive, partSix, sample], ignore_index=True)
    
    return df2

In [5]:
df2 = getDFTwo()

In [6]:
def addOtherDataToDataFrame (df, df2):
    df2.dropna(inplace = True)
    df2.reset_index(drop=True, inplace=True)
    helpfulness = []
    for row in range(len(df2)):
        helpful = df2["helpful"][row]
        template = {"Usefulness": helpful[0], "Total":helpful[1]}
        helpfulness.append(template)
    helpfulness = pd.DataFrame.from_dict(helpfulness)
    df2 = pd.concat([df2, helpfulness], axis=1)
    df2[['review_summary', 'review_detail', 'Usefulness', 'Total']] = df2[['review_summary', 'review_detail', 'Usefulness', 'Total']].astype(str)
    df2[['Usefulness', 'Total']] = df2[['Usefulness', 'Total']].replace(",","", regex=True)
    df2[['Usefulness', 'Total']] = df2[['Usefulness', 'Total']].astype(float)
    df2 = df2.drop(axis = 1, columns = ['review_id', 'movie', 'spoiler_tag', 'helpful'])
    df2 = df2.rename(columns = {"reviewer":"User", "rating":"Rating", "review_summary":"Review Title", "review_date":"Date of Review", "review_detail":"Review"})
    df = pd.concat([df, df2], ignore_index=True)
    return df

In [7]:
df = addOtherDataToDataFrame (df, df2)

In [8]:
df.reset_index(drop=True, inplace=True)

In [9]:
df.describe()

Unnamed: 0,Usefulness,Total,Rating
count,5042337.0,5042337.0,5042337.0
mean,7.623624,13.78257,6.75974
std,33.7943,51.54313,2.986639
min,0.0,0.0,1.0
25%,0.0,1.0,5.0
50%,2.0,4.0,8.0
75%,6.0,11.0,9.0
max,14477.0,15948.0,10.0


In [10]:
def dropNonUsefulRows (df):
    df = df.drop(df.loc[df['Total'] == 0].index)
    df = df[df.Usefulness/df.Total > 0.5]
    return df

In [11]:
df = dropNonUsefulRows (df)

In [12]:
df.describe()

Unnamed: 0,Usefulness,Total,Rating
count,1918283.0,1918283.0,1918283.0
mean,14.63878,21.08642,7.031092
std,51.67788,74.31625,2.93152
min,1.0,1.0,1.0
25%,2.0,3.0,5.0
50%,5.0,7.0,8.0
75%,12.0,17.0,10.0
max,14477.0,15948.0,10.0


In [13]:
df['Rating'].loc[df.Rating <= 5] = 0
df['Rating'].loc[df.Rating > 5] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Rating'].loc[df.Rating > 5] = 1


In [14]:
df = df[["Rating", "Review Title", "Review"]]

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1918283 entries, 0 to 5042336
Data columns (total 3 columns):
 #   Column        Dtype  
---  ------        -----  
 0   Rating        float64
 1   Review Title  object 
 2   Review        object 
dtypes: float64(1), object(2)
memory usage: 58.5+ MB


In [16]:
df["Review"] = df['Review Title'].astype(str) + " " + df["Review"].astype(str)

In [17]:
df = df[["Rating", "Review"]]

In [18]:
def featurize (df):
    featurized = []
    onlyNegative = df[df.Rating == 0]
    onlyPositive = df[df.Rating == 1]
    positiveReviews, negativeReviews = ' '.join(onlyPositive['Review']), ' '.join(onlyNegative['Review'])
    positiveReviews, negativeReviews = positiveReviews.lower(), negativeReviews.lower()
    positiveReviews, negativeReviews = positiveReviews.split(), negativeReviews.split()
    #unrepeated = list(set(positiveReviews + negativeReviews))
    return featurized, positiveReviews, negativeReviews

In [19]:
featurized, positiveReviews, negativeReviews = featurize (df)

In [22]:
unrepeated = list(set(positiveReviews + negativeReviews))

In [23]:
len(unrepeated)

5049425

In [25]:
fullList = positiveReviews + negativeReviews

KeyboardInterrupt: 

In [None]:
for word in unrepeated:
    negativeTotal = negativeReviews.count(word)
    positiveTotal = positiveReviews.count(word)
    template = {"Word":word, "Positive":positiveTotal , "Negative":negativeTotal}
    featurized.append(template)

In [None]:
def fixPunctuation (allTexts):
    allTexts = allTexts.replace(".", " ")
    allTexts = allTexts.replace(",", " ")
    allTexts = allTexts.replace("?", " ")
    allTexts = allTexts.replace("!", " ")
    allTexts = allTexts.replace(":", " ")
    allTexts = allTexts.replace(";", " ")
    allTexts = allTexts.replace("/", " ")
    allTexts = allTexts.replace(")", " ")
    allTexts = allTexts.replace("(", " ")
    allTexts = allTexts.replace("'", " ")
    allTexts = allTexts.replace("'", " ")
    
    
    return allTexts

In [None]:
def fixText (text):
    text = text.lower()
    
    text = fixPunctuation (text)
        
    text = text.split()
    
    return text

In [None]:
def combineText (df):
    text = ' '.join(df["Review"])
    
    return text

In [None]:
def featurize (df):
    allTexts = combineText (df)
    
    allTexts = fixText (allTexts)
    
    unrepeatedWords = list(set(allTexts))
    
    greaterThanFive = df[df.Rating > 5]
    greaterThanFive = combineText (greaterThanFive)
    greaterThanFive = fixText (greaterThanFive)
    lessThanSix = df[df.Rating < 6]
    lessThanSix = combineText (lessThanSix)
    lessThanSix = fixText (lessThanSix)
    
    numberOfWords = []
    
    for word in unrepeatedWords: 
        greaterThanFiveCount = greaterThanFive.count(word)
        lessThanSixCount = lessThanSix.count(word)
        numberOfWords = {"Word":word, "greaterThanFive":greaterThanFiveCount, "lessThanSix":lessThanSixCount}
    
    numberOfWords = pd.DataFrame.from_dict(numberOfWords)
    
    return df, numberOfWords

In [None]:
df, numberOfWords = featurize (df)

In [None]:
def makeColumns (df):
    allTexts = ""
    reviewTitle = ' '.join(df["Review Title"])
    review = ' '.join(df["Review"])
    allTexts = reviewTitle + " " + review
    
    allTexts = allTexts.lower()
    
    allTexts = fixPunctuation (allTexts)
        
    allTexts = allTexts.split()
    
    dic = {}
        
    for word in allTexts:
        if word in dic:
            x += 1
        else:
            x = 1
    
        dic[word] = x
        
    nonRelevantWords = [key for key, val in dic.items() if val <= 1]
    relevantWords = {key:[] for key, val in dic.items() if val > 1}
    
    relevantWords = pd.DataFrame.from_dict(relevantWords)
    
    df = pd.concat([df, relevantWords], ignore_index=True)
    
    return df, nonRelevantWords

In [None]:
df, nonRelevantWords = makeColumns (df)

In [None]:
def getValues (df, nonRelevantWords):
    end = len(df)
    
    for i in range (end):
        wordsForRow = df["Review Title"][i] + " " + df["Review"][i]
        
        wordsForRow = wordsForRow.lower()
    
        wordsForRow = fixPunctuation (wordsForRow)
        
        wordsForRow = wordsForRow.split()
        
        unrepeatedWords = list(set(wordsForRow))
        
        for word in unrepeatedWords:
            if word not in nonRelevantWords:
                df.loc[i, word] = wordsForRow.count(word)
            else:
                pass
    
    df = df.fillna(0)
    
    return df

In [None]:
df = getValues (df, nonRelevantWords)

In [None]:
df

In [None]:
cor = df.corr()

In [None]:
cor

In [None]:
for column in cor:
    if not(abs(cor['Rating'][column]) >= 0.01):
        df = df.drop(columns = column)

In [None]:
df