In [4]:
import pandas
data = pandas.read_csv('/content/Dataset.csv')

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
import pickle
from sklearn.metrics import classification_report

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
## Encoding the target values - ham to 0 and Spam to 1

encoder = LabelEncoder()
data['Sentiment'] = encoder.fit_transform(data['Sentiment'])

In [7]:
data['Sentiment'].unique()


array([2, 0, 1])

In [8]:
# Get the mapping of labels to encoded values
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))

# Print the label mapping
print(label_mapping)

{'Negative': 0, 'Neutral': 1, 'Positive': 2}


In [9]:
data

Unnamed: 0,User_Comments,Labels,Sentiment
0,The housing market in this neighborhood is thr...,Housing,2
1,Prices for houses in this area have skyrockete...,Housing,0
2,There seems to be a mix of older homes and new...,Housing,1
3,I just found the perfect house in this area! T...,Housing,2
4,"With ongoing community development projects, I...",Housing,2
5,The sense of community in this neighborhood is...,Housing,2
6,"Wi-Fi connectivity is inconsistent and slow, w...",Housing,0
7,"The quality of local schools is below average,...",Housing,0
8,With no cultural or entertainment venues nearb...,Housing,0
9,The neighborhood lacks adequate lighting in pu...,Housing,0


In [11]:
## Preprocessing
# - Checking for missing values
# - Checking for duplicates and keeping only one

def preprocess(data):
    print("Null Values: ", data.isnull().sum())
    data = data.drop_duplicates(keep='first')
    print("Duplicate data: ", data.duplicated().sum())
    print("Shape : ", data.shape)
    print("Target Variable Value Count:", data['Sentiment'].value_counts())

preprocess(data)

Null Values:  User_Comments    0
Labels           0
Sentiment        0
dtype: int64
Duplicate data:  0
Shape :  (46, 3)
Target Variable Value Count: Sentiment
0    24
2    19
1     3
Name: count, dtype: int64


In [13]:
data['num_characters'] = data['User_Comments'].apply(len)


In [14]:
data['num_of_words'] = data['User_Comments'].apply(lambda x: len(nltk.word_tokenize(x)))


In [15]:
data['num_of_sentences'] = data['User_Comments'].apply(lambda x: len(nltk.sent_tokenize(x)))


In [16]:
data.head()


Unnamed: 0,User_Comments,Labels,Sentiment,num_characters,num_of_words,num_of_sentences
0,The housing market in this neighborhood is thr...,Housing,2,129,20,1
1,Prices for houses in this area have skyrockete...,Housing,0,118,20,1
2,There seems to be a mix of older homes and new...,Housing,1,127,24,1
3,I just found the perfect house in this area! T...,Housing,2,110,23,2
4,"With ongoing community development projects, I...",Housing,2,125,22,1


In [17]:
def transform_text(text):
    text = text.lower()
    text = nltk.word_tokenize(text)

    y = []
    for i in text:
        if i.isalnum():
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        if i not in stopwords.words('english'):
            y.append(i)

    text = y[:]
    y.clear()

    for i in text:
        y.append(ps.stem(i))


    return " ".join(y)

In [18]:
data['transformed_text'] = data['User_Comments'].apply(transform_text)

In [19]:
## Converting text to numbers

cv = CountVectorizer()
tfidf = TfidfVectorizer(max_features=3000)

In [20]:
X = tfidf.fit_transform(data['transformed_text']).toarray()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
y = data['Sentiment'].values

In [24]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [25]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
#print(accuracy_score(y_test,y_pred2,average='micro'))
#print(confusion_matrix(y_test,y_pred2,average='micro'))
print(precision_score(y_test,y_pred2,average='micro'))

0.3


In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=2)


In [26]:
pickle.dump(tfidf,open('vectorizerUpdated.pkl','wb'))

pickle.dump(mnb,open('modelUpdated.pkl','wb'))