# **Question 1**

In [1]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
# Load the tab-separated movie-review file into a DataFrame.
data = pd.read_csv(
    'moviereviews2.tsv',
    sep='\t',
)

# Preview the first rows to check the columns that were parsed.
data.head()

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [3]:
# Count the missing values in each column.
missing_mask = data.isnull()
missing_mask.sum()

label      0
review    20
dtype: int64

In [4]:
# Discard every row that contains a missing value (the null count above
# reports 20 missing reviews); rebinding keeps the step free of in-place mutation.
data = data.dropna()

# **Question 2**

In [5]:
# Features are the raw review texts; targets are the 'pos'/'neg' labels.
X = data['review']
y = data['label']

# Hold out 40% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=123456
)

# Report the size of each partition. (The former mid-cell `X_train.head()`
# was removed: as a non-final expression its value was silently discarded.)
print(X_train.shape, " ", y_train.shape)
print(X_test.shape, " ", y_test.shape)

(3588,)   (3588,)
(2392,)   (2392,)


# **Question 3**

In [6]:
# Two-stage text classifier: TF-IDF features feeding a linear SVM.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf = Pipeline(steps=pipeline_steps)

# Fit on the training reviews; as the cell's last expression, the fitted
# pipeline is also what the cell displays.
text_clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()), ('clf', LinearSVC())])

# **Question 4**

In [7]:
# Predict a 'pos'/'neg' label for every held-out review in X_test.
predictions = text_clf.predict(X_test)

In [8]:
# Display the predicted labels for the test set.
predictions

array(['neg', 'pos', 'pos', ..., 'pos', 'neg', 'neg'], dtype=object)

In [9]:
# Cross-tabulate true labels (rows) against predicted labels (columns).
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[1064,  109],
       [  81, 1138]], dtype=int64)

In [10]:
# Per-class precision, recall and F1 on the held-out reviews.
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

         neg       0.93      0.91      0.92      1173
         pos       0.91      0.93      0.92      1219

    accuracy                           0.92      2392
   macro avg       0.92      0.92      0.92      2392
weighted avg       0.92      0.92      0.92      2392



In [11]:
# Overall accuracy on the test set, expressed as a percentage.
accuracy_pct = accuracy_score(y_test, predictions) * 100
print(accuracy_pct)

92.05685618729098


# **The overall accuracy is equal to**

## $\frac{\text{True Positive} + \text{True Negative}}{\text{True Positive} + \text{False Negative} + \text{False Positive} + \text{True Negative}}$

**The accuracy is equal to 0.92, which is greater than random guessing (0.5). It shows that 92 percent of the model's predictions are correct, so this is quite a good model.**


# **Recall, which is also called Sensitivity, is equal to**

## $\frac{\text{True Positive}}{\text{True Positive} + \text{False Negative}}$

**For negative reviews it is equal to 0.91. It means that 91 percent of the actual negative reviews are predicted to be negative.**

**For positive reviews it is equal to 0.93. It means that 93 percent of the actual positive reviews are predicted to be positive.**


# **Precision, which is also called Positive Predictive Value, is equal to**

## $\frac{\text{True Positive}}{\text{True Positive} + \text{False Positive}}$

**For negative reviews it is equal to 0.93. It means that 93 percent of the reviews predicted to be negative are actually negative.**

**For positive reviews it is equal to 0.91. It means that 91 percent of the reviews predicted to be positive are actually positive.**



# **Question 5**

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
import time
import re
import pandas as pd
from selenium.webdriver.common.action_chains import ActionChains


In [32]:
# Google search queries, one per film. Each string is typed into the search
# box and is also matched against the result link text to open the IMDb page.
text = [
    "Coco (2017) - IMDb",
    "Soul (2020) - IMDb",
    "The Boss Baby: Family Business (2021) - IMDb",
    "Luca (2021) - IMDb",
    "Inside Out (2015) - IMDb",
]

# Location of the local chromedriver binary (machine-specific).
chromedriver_path = r"C:\Users\User\Downloads\chromedriver.exe"

# Row 0 is an empty placeholder that a later cell drops by label, so it is
# kept to preserve the frame's index layout (0 = placeholder, 1..n = reviews).
# Rows are collected in a list and the frame is built once at the end instead
# of calling the deprecated DataFrame.append inside the loop.
review_rows = [{'review': ""}]

# NOTE(review): `find_element_by_*` and `executable_path` are Selenium 3 APIs;
# they are kept as-is to match the Selenium version this notebook was run with.
for query in text:
    options = Options()
    # Run Chrome without a visible window. The previous
    # `options.headlesse = True` misspelled the attribute and had no effect.
    options.add_argument("--headless")
    options.add_argument("start-normal")
    driver = webdriver.Chrome(options=options, executable_path=chromedriver_path)
    try:
        # Search Google for the film and follow the matching IMDb result.
        driver.get('https://www.google.com/')
        time.sleep(4)  # wait for the search page to load
        typing = driver.find_element_by_class_name('gLFyf')
        typing.send_keys(query)
        typing.submit()
        link = driver.find_element_by_partial_link_text(query)
        link.click()
        time.sleep(3)

        # Open the "User reviews" page and take the first element with class 'text'.
        user_review = driver.find_element_by_partial_link_text("User reviews")
        user_review.click()
        time.sleep(3)
        review = driver.find_element_by_class_name('text').text
    finally:
        # Always shut the browser session down, even when a lookup above fails,
        # so no Chrome/chromedriver process is left running.
        driver.quit()
    review_rows.append({'review': review})

film_review = pd.DataFrame(review_rows)

In [33]:
# Remove the empty placeholder row (index label 0) that seeded the frame,
# leaving only the scraped reviews.
film_review = film_review.drop(index=0)
film_review

Unnamed: 0,review
1,Im Mexican and all i can say is Thanks you Piz...
2,I can't put into words how close to home this ...
3,"Usually my reviews are quite long in detail, b..."
4,Beautiful animation and a story of live and fr...
5,I confess that I had to watch this twice befor...


In [34]:
# Persist the scraped reviews to CSV; index=False leaves the row index out of the file.
film_review.to_csv('film_review.csv', index = False)

In [13]:
# Reload the saved reviews from disk.
# NOTE(review): presumably this lets the cells below run without repeating the scrape — confirm.
film_review = pd.read_csv('film_review.csv')

In [14]:
# Display the reloaded reviews.
film_review

Unnamed: 0,review
0,Im Mexican and all i can say is Thanks you Piz...
1,I can't put into words how close to home this ...
2,"Usually my reviews are quite long in detail, b..."
3,Beautiful animation and a story of live and fr...
4,I confess that I had to watch this twice befor...


In [15]:
# Select the review text column; this Series is the input for the trained pipeline.
A_test = film_review["review"]
A_test

0    Im Mexican and all i can say is Thanks you Piz...
1    I can't put into words how close to home this ...
2    Usually my reviews are quite long in detail, b...
3    Beautiful animation and a story of live and fr...
4    I confess that I had to watch this twice befor...
Name: review, dtype: object

In [16]:
# Classify the scraped reviews with the pipeline fitted earlier.
# Note: this rebinds `predictions`, replacing the test-set predictions computed above.
predictions = text_clf.predict(A_test)


In [17]:
# Display the predicted label for each scraped review, in the same order as the rows of film_review.
predictions

array(['pos', 'pos', 'neg', 'pos', 'pos'], dtype=object)

## For Coco, Soul, Luca and Inside Out the scraped user review is predicted as positive, while the scraped user review of The Boss Baby: Family Business is predicted as negative.

