# Coursera Reviews Sentiment Analysis Model

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/Major Project/reviews.csv')
print(df) 


            Id                                             Review  Label
0            0                               good and interesting      5
1            1  This class is very helpful to me. Currently, I...      5
2            2  like!Prof and TAs are helpful and the discussi...      5
3            3  Easy to follow and includes a lot basic and im...      5
4            4  Really nice teacher!I could got the point eazl...      4
...        ...                                                ...    ...
107013  107013  Trendy topic with talks from expertises in the...      4
107014  107014  Wonderful! Simple and clear language, good ins...      5
107015  107015   an interesting and fun course. thanks. dr quincy      5
107016  107016  very broad perspective, up to date information...      4
107017  107017  An informative course on the social and financ...      4

[107018 rows x 3 columns]


In [3]:
x = df.iloc[0:20000,1]
y = df.iloc[0:20000,2]

In [4]:
import re
processed_features = []

for sentence in range(0, len(x)):
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(x[sentence]))

    # remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(processed_features,y,test_size=0.2, random_state=0,stratify=y)

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
text_model = Pipeline([('tfidf',TfidfVectorizer()),('model',SVC())])

In [7]:
import numpy as np
np.unique(y_test,return_counts=True)

(array([1, 2, 3, 4, 5]), array([ 144,  125,  251,  785, 2695]))

In [8]:
import numpy as np
np.unique(y_train,return_counts=True)

(array([1, 2, 3, 4, 5]), array([  577,   502,  1005,  3138, 10778]))

In [9]:
text_model.fit(x_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', d

In [10]:
y_pred = text_model.predict(x_test)



In [11]:
y_pred

array([5, 5, 5, ..., 5, 5, 5])

In [12]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [13]:
accuracy_score(y_pred,y_test)*100

71.72500000000001

In [14]:
confusion_matrix(y_pred,y_test)

array([[  35,   20,    6,    0,    2],
       [   0,    2,    4,    0,    0],
       [  18,   14,   16,   17,    5],
       [  19,   42,  113,  217,   89],
       [  72,   47,  112,  551, 2599]])

In [15]:
import joblib
joblib.dump(text_model,'Movie Review Sentiment')

['Movie Review Sentiment']

In [16]:
!pip install streamlit --quiet
!pip install pyngrok==4.1.1 --quiet
from pyngrok import ngrok

[K     |████████████████████████████████| 8.2MB 5.8MB/s 
[K     |████████████████████████████████| 4.2MB 54.3MB/s 
[K     |████████████████████████████████| 112kB 59.3MB/s 
[K     |████████████████████████████████| 81kB 12.3MB/s 
[K     |████████████████████████████████| 163kB 59.6MB/s 
[K     |████████████████████████████████| 122kB 58.9MB/s 
[K     |████████████████████████████████| 71kB 11.7MB/s 
[?25h  Building wheel for blinker (setup.py) ... [?25l[?25hdone
[31mERROR: google-colab 1.0.0 has requirement ipykernel~=4.10, but you'll have ipykernel 5.5.3 which is incompatible.[0m
  Building wheel for pyngrok (setup.py) ... [?25l[?25hdone


In [31]:
%%writefile app.py
import streamlit as st
import sklearn
import joblib
model = joblib.load('Movie Review Sentiment')
st.title('Movie Sentiment')
ip = st.text_input('Enter your review')
op = model.predict([ip])
if st.button('Predict'):
  temp = op[0]
  if temp==5:
    st.title("Loved It! (Rating: 5)")
  elif temp==4:
    st.title("It's Good (Rating: 4)")
  elif temp==3:
    st.title("It's Neutral (Rating: 3)")
  elif temp==2:
    st.title("It's Bad (Rating: 2)")
  elif temp==1:
    st.title("It's the worst course User have ever seen! (Rating: 1)")

Overwriting app.py


In [18]:
!nohup streamlit run app.py &
url=ngrok.connect(port='8501')
url

nohup: appending output to 'nohup.out'


'http://694c078b3cc2.ngrok.io'