In [2]:
import pandas as pd
import re
import nltk

# Fetch the NLTK resources used below: tokenizer models, the stop-word lists
# and the WordNet data the lemmatizer relies on.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import train_test_split

# Shared lemmatizer instance used by lemmatizeWord(). Despite the name `port`,
# this is a WordNet lemmatizer, not a Porter stemmer.
port = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /home/rohan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rohan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rohan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/rohan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Load the labelled comments file and preview the first rows.
initial_data = pd.read_csv('labelled_ShapeOfYou.csv', encoding='utf8')
initial_data.head()

Unnamed: 0,Author,Comment,Polarity
0,Don Gillies,Nothing better than being young and in love. ...,1.0
1,Dogeball,this song. . .,0.0
2,Varifiable Jelly,5 years and it ain t stopped. Dang,0.0
3,0,there is literally nobody in this world who do...,0.0
4,Nirmala Gowda,Me I 39,0.0


In [4]:
# Dimensions of the loaded frame as (rows, columns).
initial_data.shape

(16333, 3)

In [5]:
# Keep an independent working copy limited to the author, comment and label columns,
# so later steps never touch the frame that was read from disk.
df = initial_data.loc[:, ['Author', 'Comment', 'Polarity']].copy()
df.head()

Unnamed: 0,Author,Comment,Polarity
0,Don Gillies,Nothing better than being young and in love. ...,1.0
1,Dogeball,this song. . .,0.0
2,Varifiable Jelly,5 years and it ain t stopped. Dang,0.0
3,0,there is literally nobody in this world who do...,0.0
4,Nirmala Gowda,Me I 39,0.0


In [6]:
#convert comments to lower case
# def lowercaseConverter(text):
#     lower_text = [str(text[i]).lower() for i in range(len(text))]
#     return lower_text

In [7]:
#TESTING OUT TOKENIZATION
#sentence tokenizing
#sent_tok = [ sent_tokenize(str(sent)) for sent in df['Comment']]

#word tokenizing
#word_tok = [ word_tokenize(str(sent)) for sent in df['Comment']]

In [8]:
#removing special characters
def specialCharacterRemover(word_tok):
    """Strip punctuation/special characters from every token.

    Parameters
    ----------
    word_tok : iterable of iterables of str
        One token sequence per sentence.

    Returns
    -------
    list of list of str
        Same sentence order as the input. Each token has every character that
        is neither a word character nor whitespace removed; tokens that become
        empty are dropped.
    """
    non_word = re.compile(r'[^\w\s]')
    cleaned_sentences = []
    for sentence in word_tok:
        stripped_tokens = (non_word.sub("", token) for token in sentence)
        cleaned_sentences.append([token for token in stripped_tokens if token != ""])
    return cleaned_sentences

In [9]:
#removing stop words
def stopWordsRemover(text, stop_words=None):
    """Drop stop words from every tokenised sentence.

    Parameters
    ----------
    text : iterable of iterables of str
        One token sequence per sentence.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stop-word list.

    Returns
    -------
    list of list of str
        Same sentence order as the input, with stop words removed. Surviving
        tokens keep their original casing.

    Notes
    -----
    The stop-word list is loaded once and stored in a set. Previously the
    corpus list was re-read and scanned linearly for every single token.
    Matching is case-insensitive: NLTK's list is all lowercase, and the
    incoming tokens are not lowercased, so "The" or "I" used to slip through.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = {str(word).lower() for word in stop_words}
    return [
        [word for word in words if word.lower() not in stop_set]
        for words in text
    ]

In [10]:
#lemmatizing
def lemmatizeWord(text):
    """Lemmatize every token of every sentence with the module-level ``port``.

    Parameters
    ----------
    text : iterable of iterables of str
        One token sequence per sentence.

    Returns
    -------
    list of list of str
        Same nesting and order as the input, with each token replaced by
        ``port.lemmatize(token)``.
    """
    return [
        [port.lemmatize(token) for token in sentence]
        for sentence in text
    ]

In [41]:
def wordListToString(list_words):
    """Join each token list back into one space-separated string.

    Parameters
    ----------
    list_words : iterable of iterables of str
        One token sequence per sentence.

    Returns
    -------
    list of str
        One string per input sentence, in the same order; an empty token
        sequence yields an empty string.
    """
    return [" ".join(tokens) for tokens in list_words]

In [11]:
# Features are the raw comment text; the target is the polarity label.
X, y = df['Comment'], df['Polarity']

In [12]:
# Hold out 30% of the rows for testing; the fixed seed keeps the shuffled split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=100,
)

In [13]:
# Report how many labelled rows landed on each side of the split.
print(
    "Training data points:", len(y_train),
    "; Testing data points:", len(y_test),
)

Training data points: 11433 ; Testing data points: 4900


In [42]:
def getCleanedText(text_data):
    """Run the full cleaning pipeline over a collection of comments.

    Each item is converted with ``str()`` and word-tokenised. The tokens then
    pass through ``specialCharacterRemover``, ``stopWordsRemover`` and
    ``lemmatizeWord`` in that order, and are finally re-joined into one
    string per comment by ``wordListToString``.

    Parameters
    ----------
    text_data : iterable
        The comments to clean (any iterable of items works; each is passed
        through ``str()`` first).

    Returns
    -------
    list of str
        One cleaned string per input item, in the same order.
    """
    tokenized = [word_tokenize(str(comment)) for comment in text_data]
    normalized = lemmatizeWord(
        stopWordsRemover(
            specialCharacterRemover(tokenized)
        )
    )
    return wordListToString(normalized)

In [43]:
# Run the cleaning pipeline on the training comments.
X_clean = getCleanedText(X_train)

In [45]:
# Run the same cleaning pipeline on the held-out test comments.
Xt_clean = getCleanedText(X_test)

In [47]:
#vectorize
from sklearn.feature_extraction.text import CountVectorizer
# Module-level vectorizer shared by vectorize_text(); ngram_range=(1,2) counts
# both single words and two-word sequences.
cv = CountVectorizer(ngram_range =(1,2))

def vectorize_text(list_text, fit=None):
    """Convert cleaned strings into a dense bag-of-words count matrix.

    Uses the module-level ``cv`` vectorizer.

    Parameters
    ----------
    list_text : iterable of str
        Documents to vectorize.
    fit : bool or None, optional
        ``True`` learns (or re-learns) the vocabulary from ``list_text``.
        ``False`` encodes ``list_text`` with the vocabulary ``cv`` already
        holds. ``None`` (default) fits only if ``cv`` has no vocabulary yet.

    Returns
    -------
    list
        A one-element list holding the dense count array, the same container
        shape as before so existing callers keep working.

    Notes
    -----
    Previously every call used ``fit_transform``. Vectorizing the test split
    therefore re-fitted ``cv`` on test data, which gave the test matrix a
    different vocabulary and column layout from the training matrix and left
    ``cv`` holding the test vocabulary. With the default ``fit=None`` the
    first call (training text) fits and later calls only transform.
    """
    if fit is None:
        # scikit-learn sets `vocabulary_` on the vectorizer once it is fitted.
        fit = not hasattr(cv, "vocabulary_")
    counts = cv.fit_transform(list_text) if fit else cv.transform(list_text)
    # Dense conversion is kept for backward compatibility; it can be very
    # memory hungry for large vocabularies.
    vec = [counts.toarray()]
    print(vec)
    return vec

In [49]:
#bag of word model
# NOTE(review): both matrices come from the same module-level CountVectorizer via
# vectorize_text(); their columns are only comparable if the second call reuses the
# vocabulary learned from the training text — confirm before fitting a model.
X_vec = vectorize_text(X_clean)
Xt_vec = vectorize_text(Xt_clean)

[array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])]
[array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])]


In [51]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back on older versions.
# .tolist() keeps the printed output a plain Python list of strings, as before.
print(
    cv.get_feature_names_out().tolist()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)



