In [3]:
import pandas as pd
df = pd.read_fwf("test.ft.txt",header = None,index = False, names=['Rating','Review'])

In [4]:
df.head()

Unnamed: 0,Rating,Review
0,__label__2,Great CD: My lovely Pat has one of the GREAT v...
1,__label__2,One of the best game music soundtracks - for a...
2,__label__1,Batteries died within a year ...: I bought thi...
3,__label__2,"works fine, but Maha Energy is better: Check o..."
4,__label__2,Great for the non-audiophile: Reviewed quite a...


In [5]:
df['Rating'] = df['Rating'].replace(['__label__1'],'Negative')
df['Rating'] = df['Rating'].replace(['__label__2'],'Positive')
df.head()

Unnamed: 0,Rating,Review
0,Positive,Great CD: My lovely Pat has one of the GREAT v...
1,Positive,One of the best game music soundtracks - for a...
2,Negative,Batteries died within a year ...: I bought thi...
3,Positive,"works fine, but Maha Energy is better: Check o..."
4,Positive,Great for the non-audiophile: Reviewed quite a...


In [7]:
import nltk
nltk.download('stopwords')
stopwords_list = nltk.corpus.stopwords.words('english')
stopwords_list.remove('no')
stopwords_list.remove('not')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
import re
import string
#remove special characters
def remove_sp(text):
    text = text.lower()
    text = re.sub('\[.*?\]',"",text)
    text = re.sub('[%s]' %re.escape(string.punctuation), "", text)
    text = re.sub('\w*\d\w',"",text)
    text = re.sub('[''""_]', "", text)
    text = re.sub('\n',"", text)
    return text

cleaned = lambda x:remove_sp(x)
    

In [9]:
df['Review'] = df['Review'].apply(cleaned)

In [10]:
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()

#remove stopwords
def remove_stopwords(text):
  tokens = tokenizer.tokenize(text)
  tokens = [token.strip() for token in tokens]
  filtered_tokens = [token for token in tokens if token not in stopwords_list]
  filtered_text = ' '.join(filtered_tokens)
  return filtered_text

cleaned2 = lambda x:remove_stopwords(x)

In [11]:
df['Review'] = df['Review'].apply(cleaned2)

In [12]:
x = df['Review'].values
y = df['Rating'].values

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.55,random_state = 4)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

clf = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [15]:
clf.fit(x_train, y_train)
predicted = clf.predict(x_test)

In [16]:
from  sklearn.metrics  import accuracy_score,confusion_matrix
predicted = clf.predict(x_test)
print(accuracy_score(y_test,predicted))
print(confusion_matrix(y_test,predicted))

0.8454416116290381
[[95926 14055]
 [19948 90072]]


In [17]:
import pickle
pickle.dump(clf,open('SentimentAnalysis.p','wb'))

In [18]:
model = pickle.load(open('SentimentAnalysis.p','rb'))

In [20]:
%%writefile app.py
import streamlit as st
from PIL import Image
import nltk
import re
import string
import pickle
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize.toktok import ToktokTokenizer
import nltk

st.title("Sentiment Analysis")
st.subheader("Enter Text to analyise: ")
text = st.text_input(" ")
text = [text]
y_out = model.predict(text)

    if st.button("Predict"):
        
        if (y_out == "Positive"):
            image = Image.open("happy.jpeg")
            st.image(image,width = 250)
            st.header("WOW!! That's Positive review")
        else:
            image = Image.open("sad.jpeg")
            st.image(image,width = 250)
            st.header("That's Negative review")

Overwriting app.py
