In [1]:
# Mount Google Drive inside the Colab VM so the dataset under MyDrive is readable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import re
import nltk
import pickle
import math

# Fetch the NLTK resources used below: tokenizer models, the stop-word list,
# and the WordNet data needed by the lemmatizer.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
# from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier; it is fitted on the training split further down.
mn = MultinomialNB()
# NOTE: despite the name `port`, this is a WordNet lemmatizer (used by lemmatizeWord),
# not a Porter stemmer.
port = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Read the labelled comments CSV from the mounted Drive and preview the first rows.
path = "/content/drive/MyDrive/labelled_ShapeOfYou.csv"
initial_data = pd.read_csv(path, encoding='utf8')
initial_data.head()

Unnamed: 0,Author,Comment,Polarity
0,Don Gillies,Nothing better than being young and in love. ...,1.0
1,Dogeball,this song. . .,0.0
2,Varifiable Jelly,5 years and it ain t stopped. Dang,0.0
3,0,there is literally nobody in this world who do...,0.0
4,Nirmala Gowda,Me I 39,0.0


In [4]:
# Show (rows, columns) of the loaded frame as a quick sanity check.
initial_data.shape

(16333, 3)

In [5]:
#make new dataframe for utilizing
# df = initial_data[['Author','Comment','Polarity']].copy()
# df.head()

In [6]:
#convert comments to lower case
# def lowercaseConverter(text):
#     lower_text = [str(text[i]).lower() for i in range(len(text))]
#     return lower_text

In [7]:
#TESTING OUT TOKENIZATION
#sentence tokenizing
#sent_tok = [ sent_tokenize(str(sent)) for sent in df['Comment']]

#word tokenizing
#word_tok = [ word_tokenize(str(sent)) for sent in df['Comment']]

In [8]:
#removing special characters
def specialCharacterRemover(word_tok):
    """Strip every character that is neither a word character nor whitespace.

    Parameters
    ----------
    word_tok : iterable of iterables of str
        One inner sequence of tokens per sentence/comment.

    Returns
    -------
    list of list of str
        Same outer length as the input; tokens that end up empty after
        stripping (e.g. pure punctuation) are dropped.
    """
    cleaned_sentences = []
    for sentence in word_tok:
        # Note that \w keeps letters, digits and underscores.
        stripped_tokens = (re.sub(r'[^\w\s]', "", token) for token in sentence)
        cleaned_sentences.append([token for token in stripped_tokens if token != ""])
    return cleaned_sentences

In [9]:
#removing stop words
def stopWordsRemover(text):
    """Drop English stop words from every token list.

    Parameters
    ----------
    text : iterable of iterables of str
        One inner sequence of tokens per sentence/comment.

    Returns
    -------
    list of list of str
        Same outer length as the input, with stop-word tokens removed.
    """
    # Build the stop-word collection once; the previous version rebuilt the
    # list and scanned it linearly for every single token.
    english_stopwords = set(stopwords.words('english'))
    # NOTE(review): the membership test is case-sensitive. The NLTK list appears
    # to be lowercase, so capitalised tokens such as "The" presumably pass
    # through - confirm whether tokens should be lowercased first.
    return [
        [word for word in words if word not in english_stopwords]
        for words in text
    ]

In [10]:
#lemmatizing
def lemmatizeWord(text):
    """Lemmatize each token with the module-level WordNet lemmatizer ``port``.

    Parameters
    ----------
    text : iterable of iterables of str
        One inner sequence of tokens per sentence/comment.

    Returns
    -------
    list of list of str
        Lemmatized tokens, in the same nesting and order as the input.
    """
    return [[port.lemmatize(token) for token in sentence] for sentence in text]

In [11]:
def wordListToString(list_words):
    """Join each token list back into a single space-separated string.

    Parameters
    ----------
    list_words : iterable of iterables of str
        One inner sequence of tokens per sentence/comment.

    Returns
    -------
    list of str
        One string per inner sequence; an empty sequence yields "".
    """
    return [" ".join(tokens) for tokens in list_words]

In [12]:
# Features: the raw comment text; target: the polarity label of each comment.
X = initial_data['Comment']
y = initial_data['Polarity']

In [13]:
#parameter is a list and return is also a list type
def getCleanedText(text_data):
    """Run the full text-cleaning pipeline over a collection of comments.

    Each item is coerced to ``str``, word-tokenized, stripped of special
    characters, filtered for stop words, lemmatized, and finally re-joined
    into one space-separated string.

    Parameters
    ----------
    text_data : iterable
        Raw comments; non-string items are converted with ``str()``.

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order.
    """
    tokenized = [word_tokenize(str(comment)) for comment in text_data]
    # Stages are applied inside-out: special chars -> stop words -> lemmas -> join.
    return wordListToString(
        lemmatizeWord(
            stopWordsRemover(
                specialCharacterRemover(tokenized)
            )
        )
    )

In [14]:
# X_clean = getCleanedText(X)
# NOTE: rebinds X from the raw comments to their cleaned strings, so re-running this
# cell without first re-running the cell that defines X cleans already-cleaned text.
X = getCleanedText(X)

In [15]:
#vectorize
from sklearn.feature_extraction.text import CountVectorizer
# Count-based bag-of-words vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
cv = CountVectorizer(ngram_range =(1,2))

def vectorize_text(list_text, fit=True):
    """Turn cleaned comment strings into a dense matrix of n-gram counts.

    Parameters
    ----------
    list_text : iterable of str
        Cleaned documents to vectorize.
    fit : bool, default True
        When True (the original behaviour) the module-level vectorizer ``cv``
        learns its vocabulary from ``list_text`` before transforming it.
        Pass False to reuse the vocabulary ``cv`` has already learned, e.g.
        when vectorizing new text for a model that is already trained;
        refitting there would change the feature columns.

    Returns
    -------
    numpy.ndarray
        One row per document and one column per vocabulary entry.
    """
    counts = cv.fit_transform(list_text) if fit else cv.transform(list_text)
    # The sparse result is densified; this allocates one cell per
    # (document, n-gram) pair, which is large for big vocabularies.
    return counts.toarray()

In [16]:
#bag of word model
# X_vec = vectorize_text(X_clean)
# NOTE: rebinds X again, from cleaned strings to the dense count matrix; the vectorizer
# is fitted on every comment here, i.e. before the train/test split below.
X = vectorize_text(X)

In [17]:
# The fitted vocabulary maps each n-gram to its column index, so its size is the
# feature count. Used instead of cv.get_feature_names(), which newer scikit-learn
# releases deprecate and then remove.
print("Number of features",len(cv.vocabulary_))
print(type(X))
print(type(y))

Number of features 68369
<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>




In [18]:
#train test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100, shuffle=True)
# Ordered (unshuffled) hold-out split: the first s_f fraction of rows is used for
# training and the remainder for testing.
s_f = 0.8
total_rows = X.shape[0]
n_train = math.floor(s_f * total_rows)
n_test = math.ceil((1-s_f) * total_rows)
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
print("Total Number of rows in train:",X_train.shape[0])
print("Total Number of rows in test:",X_test.shape[0])

Total Number of rows in train: 13066
Total Number of rows in test: 3267


In [19]:
# Cross-check the split using the label vectors (should match the row counts above).
print("Training data points:",len(y_train),"; Testing data points:",len(y_test))

Training data points: 13066 ; Testing data points: 3267


In [20]:
#train in multinomial naive bayes classifier
# Fit the classifier on the training portion of the count matrix and its labels.
mn.fit(X_train,y_train)

MultinomialNB()

In [21]:
#save trained model
filename = 'finalized_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails; the bare open() call previously leaked the handle.
# NOTE(review): only the classifier is written here. Scoring new text also
# needs the fitted vectorizer `cv`, which this cell does not persist -
# confirm whether it is saved elsewhere.
with open(filename, 'wb') as model_file:
    pickle.dump(mn, model_file)

In [23]:
# print(len(Xt_vec[0]),len(y_test))
# Predict a polarity label for every held-out row.
y_pred = mn.predict(X_test)

In [33]:
#convert pandas series to list
# Wrapping in pd.Series first keeps this cell safe to re-run: once y_test is
# already a plain list it has no .to_list() and the bare call would raise
# AttributeError.
y_test = pd.Series(y_test).to_list()

In [38]:
# Spot-check one held-out example: true label vs. predicted label
# (index 23 looks arbitrary - TODO confirm).
print(y_test[23],y_pred[23])

0.0 0.0


In [39]:
# Pair each true label with its prediction, one row per held-out example.
# The mapping is no longer named `dict`, which shadowed the builtin type for
# every later cell in the kernel.
results = {'label': y_test, 'prediction': y_pred}
final_df = pd.DataFrame(results)

In [42]:
#save prediction and labels to csv
# NOTE: to_csv writes the DataFrame index as an unnamed first column by default.
final_df.to_csv('prediction.csv')