In [49]:
"""
Sentiment
Analysis

Student A applied Textblob to classify positive and negative reviews
Student B used Pandas to analyse the result of the sentiment analysis

Collocation
Extraction

Student A implemented the frequency based collocation extraction
Student B implemented the frequency based collocation extraction with pos filtering
"""

'\nSentiment\nAnalysis\n\nStudent A applied Textblob to classify positive and negative reviews\nStudent B used Pandas to analyse the result of the sentiment analysis\n\nCollocation\nExtraction\n\nStudent A implemented the frequency based collocation extraction\nStudent B implemented the frequency based collocation extraction with pos filtering\n'

In [50]:
# Student A applied Textblob to classify positive and negative reviews
# Chris     

In [51]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import plotly.express as px
from textblob import TextBlob

In [52]:
df = pd.read_csv(r"Movie_reviews.csv", sep="\t", header = None)
df.columns = ["ReviewID", "MovieID", "MovieReview"]

print(df.head(3))

   ReviewID     MovieID                                        MovieReview
0         1  076780192X  it always amazes me how people can rate the DV...
1         2  0767821599  This movie is okay, but, its not worth what th...
2         3  0782008380  If you love the Highlander 1 movie and the ser...


In [53]:

df["MovieReview"] = df["MovieReview"].replace("[?,.():;'!£@#<>/\"&]", '', regex=True) # removing punctuation
df["MovieReview"] = df["MovieReview"].str.lower() # making lower case
print(df.head(3))



   ReviewID     MovieID                                        MovieReview
0         1  076780192X  it always amazes me how people can rate the dv...
1         2  0767821599  this movie is okay but its not worth what they...
2         3  0782008380  if you love the highlander 1 movie and the ser...


In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ReviewID     10000 non-null  int64 
 1   MovieID      10000 non-null  object
 2   MovieReview  9998 non-null   object
dtypes: int64(1), object(2)
memory usage: 234.5+ KB


In [55]:
# For loop to check all reviews uploaded corrrectly
"""
for x in df["Review"]: # For row in the review column
    print(x) # Prints each review
    print(type(x)) # To check they are strings
    print("\n") # For a space for each review
"""
print(df.head(5))

   ReviewID     MovieID                                        MovieReview
0         1  076780192X  it always amazes me how people can rate the dv...
1         2  0767821599  this movie is okay but its not worth what they...
2         3  0782008380  if you love the highlander 1 movie and the ser...
3         4  0767726227  this is a great classic collection if you like...
4         5  0780621832  this is the second of john ford and john wayne...


In [56]:
# To check individ sentance score
"""
for z in df["Review"]: # For row in review
    zx = str(x)
    blob = TextBlob(z) # Input review
    for sentance in blob.sentences: # Checks each sentance in review
        print (sentance, '\t', sentance.sentiment.polarity) #Prints individ sentance score
    print("\n") # Space for each review
"""

'\nfor z in df["Review"]: # For row in review\n    zx = str(x)\n    blob = TextBlob(z) # Input review\n    for sentance in blob.sentences: # Checks each sentance in review\n        print (sentance, \'\t\', sentance.sentiment.polarity) #Prints individ sentance score\n    print("\n") # Space for each review\n'

In [57]:
# Calculates score of the each review

listOfScore = []
for x in df["MovieReview"]: # For each row in the dataset
    w = str(x)
    score = 0
    blob = TextBlob(w) # Input review
    for sentence in blob.sentences: # Checks each sentence in the dataset
        score += sentence.sentiment.polarity
    listOfScore.append(score)
print(listOfScore[0:10])

[0.18740079365079368, 0.2214285714285714, 0.05333333333333335, 0.2722222222222222, 0.05132936507936508, 0.03866666666666667, 0.06341991341991343, 0.06627535385962352, 0.048214285714285716, 0.1607843137254902]


In [58]:
len(listOfScore) # Check all scores are added

10000

In [59]:
def getSentiment(score):
    if score < 0:
        listForDF.append("Negative")
    elif score == 0:
        listForDF.append("Neutral")
    else:
        listForDF.append("Positive")

In [60]:
listForDF = [] # Empty list to add sentiment
for value in listOfScore:
    getSentiment(value) # Function call
len(listForDF) # Check size

10000

In [61]:
# Add new columns with sentiment score and meaning of score e.g pos or neg
df['Sentiment Num'] = listOfScore
df['Sentiment'] = listForDF
df.head(5) # Check dataset

Unnamed: 0,ReviewID,MovieID,MovieReview,Sentiment Num,Sentiment
0,1,076780192X,it always amazes me how people can rate the dv...,0.187401,Positive
1,2,0767821599,this movie is okay but its not worth what they...,0.221429,Positive
2,3,0782008380,if you love the highlander 1 movie and the ser...,0.053333,Positive
3,4,0767726227,this is a great classic collection if you like...,0.272222,Positive
4,5,0780621832,this is the second of john ford and john wayne...,0.051329,Positive


In [77]:
df.to_csv("data.csv")

In [62]:
# Student B used Pandas to analyse the result of the sentiment analysis
# Charlotte

In [63]:
fig = px.scatter(df, y = "Sentiment Num", x = "Sentiment", color = "Sentiment")
fig.update_traces(marker_size = 10)
fig.show()

In [64]:
px.histogram(df, x = "Sentiment", color = "Sentiment")

In [65]:
px.box(df, x="Sentiment", y = "Sentiment Num", color = "Sentiment", points = "all")

In [66]:
px.strip(df, x = "Sentiment", y = "Sentiment Num", color = "Sentiment").update_traces(jitter = 1)

In [67]:
px.strip(df, y = "Sentiment Num", color = "Sentiment").update_traces(jitter=1)


In [68]:
import nltk
import re

In [69]:
def collocation(data):
    word = []
    for x in data["MovieReview"]: # every review
        token = nltk.word_tokenize(x) # adds all words into a list
        output = list(nltk.bigrams(token)) # generates bigrams from the list
        for i in output: # adds each bigram to the list
            word.append(i)

    bigrams_w_freq = {}
    for first, second in word: # first and second word in the bigram
        if (first, second) in bigrams_w_freq: # if they are already in the dictionary
            bigrams_w_freq[(first, second)] += 1 
        #elif (second, first) in bigrams_w_freq: # if they are already in the dictionary in other combination
         #   bigrams_w_freq[(first, second)] += 1
        else: # if they are not already in the dictionary
            bigrams_w_freq[(first, second)] = 1

    freq_sorted = dict(sorted(bigrams_w_freq.items(), key = lambda x: x[1], reverse=True)) # sorts them numerically
    counts = dict(list(freq_sorted.items())[0:40]) # first 40 values
    print(counts)

In [70]:
positive = df[df["Sentiment"] == "Positive"]
collocation(positive)

{('of', 'the'): 11001, ('in', 'the'): 6115, ('this', 'movie'): 3670, ('is', 'a'): 3570, ('and', 'the'): 3277, ('to', 'the'): 3200, ('the', 'film'): 2882, ('it', 'is'): 2771, ('the', 'movie'): 2764, ('to', 'be'): 2761, ('this', 'is'): 2620, ('this', 'film'): 2517, ('on', 'the'): 2482, ('for', 'the'): 2233, ('with', 'the'): 2074, ('it', 'was'): 2021, ('is', 'the'): 1864, ('one', 'of'): 1855, ('as', 'a'): 1748, ('of', 'a'): 1722, ('in', 'a'): 1691, ('from', 'the'): 1584, ('in', 'this'): 1542, ('if', 'you'): 1537, ('at', 'the'): 1426, ('as', 'the'): 1421, ('i', 'was'): 1421, ('of', 'this'): 1346, ('that', 'the'): 1291, ('to', 'see'): 1261, ('the', 'story'): 1247, ('i', 'have'): 1207, ('the', 'first'): 1206, ('with', 'a'): 1185, ('by', 'the'): 1164, ('and', 'i'): 1151, ('was', 'a'): 1141, ('is', 'not'): 1136, ('all', 'the'): 1121, ('there', 'is'): 1110}


In [71]:
negative = df[df["Sentiment"] == "Negative"]
collocation(negative)

{('of', 'the'): 1872, ('this', 'movie'): 1327, ('in', 'the'): 1238, ('is', 'a'): 687, ('the', 'movie'): 684, ('and', 'the'): 647, ('to', 'be'): 640, ('this', 'film'): 579, ('the', 'film'): 579, ('this', 'is'): 569, ('it', 'was'): 558, ('to', 'the'): 543, ('it', 'is'): 511, ('for', 'the'): 426, ('on', 'the'): 424, ('of', 'a'): 405, ('movie', 'is'): 398, ('one', 'of'): 392, ('in', 'a'): 384, ('is', 'the'): 380, ('with', 'the'): 377, ('in', 'this'): 372, ('if', 'you'): 354, ('as', 'a'): 320, ('i', 'was'): 315, ('at', 'the'): 301, ('the', 'first'): 293, ('that', 'the'): 282, ('i', 'have'): 281, ('the', 'worst'): 274, ('is', 'not'): 269, ('from', 'the'): 265, ('to', 'see'): 257, ('all', 'the'): 255, ('of', 'this'): 255, ('with', 'a'): 255, ('was', 'a'): 253, ('there', 'is'): 251, ('for', 'a'): 247, ('out', 'of'): 246}


In [72]:
#neutral = df[df["Sentiment"] == "Neutral"]
#print(neutral.head(2))
#collocation(neutral)

In [114]:
from nltk import word_tokenize
from nltk import pos_tag
from nltk import bigrams
def POS(data):
    word = []
    for i in data['MovieReview']:
        words = word_tokenize(i) # adds all words to a list
        words_with_POS_tags = pos_tag(words) # assigns POS tags to all
        bigrams_with_POS_tags = list(bigrams(words_with_POS_tags)) # generates bigrams from the list
        for i in bigrams_with_POS_tags:  # adds each bigram to the list
            word.append(i)
        
    bigrams_with_freq = {}
    for first, second in word:
        first_word, first_POS = first[0], first[1]
        second_word, second_POS = second[0], second[1]
        if (first_word, second_word) in bigrams_with_freq: # already in the dictionary
            bigrams_with_freq[(first_word, second_word)] += 1
        elif (second_word, first_word) in bigrams_with_freq: # if they are already in the dictionary in other combination
            bigrams_with_freq[(second_word, first_word)] += 1
        else: # not in the dictionary
            bigrams_with_freq[(first_word, second_word)] = 1
    
    freq_sorted = dict(sorted(bigrams_with_freq.items(), key=lambda x: x[1], reverse=True))  # sorts them numerically
    counts = dict(list(freq_sorted.items())[0:40])  # first 40 values
    print(counts)


In [87]:
positive = df[df["Sentiment"] == "Positive"]
POS(positive)

{('of', 'the'): 11001, ('in', 'the'): 6115, ('this', 'movie'): 3670, ('is', 'a'): 3570, ('and', 'the'): 3277, ('to', 'the'): 3200, ('the', 'film'): 2882, ('it', 'is'): 2771, ('the', 'movie'): 2764, ('to', 'be'): 2761, ('this', 'is'): 2620, ('this', 'film'): 2517, ('on', 'the'): 2482, ('for', 'the'): 2233, ('with', 'the'): 2074, ('it', 'was'): 2021, ('is', 'the'): 1864, ('one', 'of'): 1855, ('as', 'a'): 1748, ('of', 'a'): 1722, ('in', 'a'): 1691, ('from', 'the'): 1584, ('in', 'this'): 1542, ('if', 'you'): 1537, ('at', 'the'): 1426, ('as', 'the'): 1421, ('i', 'was'): 1421, ('of', 'this'): 1346, ('that', 'the'): 1291, ('to', 'see'): 1261, ('the', 'story'): 1247, ('i', 'have'): 1207, ('the', 'first'): 1206, ('with', 'a'): 1185, ('by', 'the'): 1164, ('and', 'i'): 1151, ('was', 'a'): 1141, ('is', 'not'): 1136, ('all', 'the'): 1121, ('there', 'is'): 1110}


In [115]:
negative = df[df["Sentiment"] == "Negative"]
POS(negative)

{('of', 'the'): 1872, ('this', 'movie'): 1354, ('in', 'the'): 1239, ('the', 'movie'): 757, ('is', 'a'): 687, ('and', 'the'): 647, ('to', 'be'): 645, ('is', 'this'): 629, ('the', 'film'): 621, ('this', 'film'): 595, ('it', 'was'): 570, ('it', 'is'): 566, ('to', 'the'): 543, ('for', 'the'): 426, ('on', 'the'): 425, ('one', 'of'): 413, ('of', 'a'): 405, ('movie', 'is'): 400, ('in', 'this'): 396, ('in', 'a'): 384, ('is', 'the'): 381, ('with', 'the'): 378, ('if', 'you'): 365, ('i', 'was'): 326, ('is', 'that'): 320, ('as', 'a'): 320, ('at', 'the'): 301, ('the', 'first'): 300, ('i', 'have'): 292, ('that', 'the'): 282, ('the', 'worst'): 276, ('is', 'not'): 269, ('there', 'is'): 268, ('from', 'the'): 265, ('to', 'see'): 260, ('all', 'the'): 257, ('of', 'this'): 257, ('with', 'a'): 255, ('was', 'a'): 253, ('and', 'i'): 247}


In [76]:
#neutral = df[df["Sentiment"] == "Neutral"]
#POS(neutral)

In [116]:
## TEST!!!

from nltk import word_tokenize
from nltk import pos_tag
from nltk import bigrams


def POS(data):
    word = []
    for i in data['MovieReview']:
        words = word_tokenize(i)  # adds all words to a list
        words_with_POS_tags = pos_tag(words)  # assigns POS tags to all
        # generates bigrams from the list
        bigrams_with_POS_tags = list(bigrams(words_with_POS_tags))
        for i in bigrams_with_POS_tags:  # adds each bigram to the list
            word.append(i)

    bigrams_with_freq = {}
    for first, second in word:
        first_word, first_POS = first[0], first[1]
        second_word, second_POS = second[0], second[1]
        if first_POS in ("IN", "TO", "DT", "CC"): # removing certain POS tags
            continue
        elif second_POS in ("IN", "TO", "DT", "CC"):
            continue
        elif (first_word, second_word) in bigrams_with_freq:  # already in the dictionary
            bigrams_with_freq[(first_word, second_word)] += 1
            # if they are already in the dictionary in other combination
        elif (second_word, first_word) in bigrams_with_freq:
            bigrams_with_freq[(second_word, first_word)] += 1
        else:  # not in the dictionary
            bigrams_with_freq[(first_word, second_word)] = 1

    freq_sorted = dict(sorted(bigrams_with_freq.items(
    ), key=lambda x: x[1], reverse=True))  # sorts them numerically
    counts = dict(list(freq_sorted.items())[0:40])  # first 40 values
    print(counts)

negative = df[df["Sentiment"] == "Negative"]
POS(negative)


{('it', 'was'): 570, ('it', 'is'): 566, ('movie', 'is'): 400, ('i', 'was'): 326, ('i', 'have'): 292, ('is', 'not'): 269, ('there', 'is'): 268, ('film', 'is'): 224, ('have', 'been'): 216, ('movie', 'was'): 208, ('i', 'dont'): 181, ('there', 'are'): 173, ('i', 'am'): 164, ('i', 'would'): 161, ('is', 'one'): 161, ('is', 'so'): 159, ('would', 'have'): 159, ('they', 'are'): 158, ('movie', 'i'): 142, ('i', 'can'): 142, ('is', 'just'): 139, ('he', 'is'): 139, ('you', 'can'): 138, ('would', 'be'): 132, ('you', 'are'): 126, ('i', 'had'): 123, ('i', 'think'): 121, ('he', 'was'): 116, ('that', 'is'): 116, ('i', 'thought'): 115, ('do', 'not'): 115, ('i', 'didnt'): 114, ('you', 'have'): 113, ('i', 'cant'): 112, ('ever', 'seen'): 110, ('which', 'is'): 109, ('special', 'effects'): 106, ('they', 'were'): 105, ('should', 'have'): 99, ('when', 'i'): 99}
