# Data Cleaning / Pre-Processing

In [1]:
import pandas as pd

# Load the raw unseen posts.
data = pd.read_csv("../csv/uncleaned/unseen_input.csv")
# dropna() returns a new frame instead of modifying `data`, so the result has to
# be assigned back for the rows with missing values to actually be removed.
# The index is reset so the remaining rows are numbered 0..n-1 without gaps.
data = data.dropna().reset_index(drop=True)
data

Unnamed: 0,review_post
0,I wanna go strawberry picking
1,I wanna go boating and biking
2,Worship God in a Church and pray!
3,market place since I wanna buy strawberry jam ...


In [2]:
import re
import nltk

from sklearn.feature_extraction import text
# Materialise scikit-learn's built-in English stop-word collection as a list.
stop_words = [*text.ENGLISH_STOP_WORDS]
# Peek at the first five entries.
stop_words[:5]

['being', 'amount', 'due', 'how', 'were']

In [3]:
# Work on a copy so the frame loaded from disk is left untouched.
_data = data.copy()


def _strip_non_letters(post):
    """Replace every run of characters other than ASCII letters/spaces in str(post) with one space."""
    return re.sub('[^A-Za-z ]+', ' ', str(post))


_data['removed_special_charas_review_post'] = _data['review_post'].map(_strip_non_letters)
_data


Unnamed: 0,review_post,removed_special_charas_review_post
0,I wanna go strawberry picking,I wanna go strawberry picking
1,I wanna go boating and biking,I wanna go boating and biking
2,Worship God in a Church and pray!,Worship God in a Church and pray
3,market place since I wanna buy strawberry jam ...,market place since I wanna buy strawberry jam ...


In [4]:
# Lower-case the text produced by the special-character stripping step.
stripped_posts = _data['removed_special_charas_review_post']
_data['to_lower_case_review_post'] = stripped_posts.map(str.lower)
_data

Unnamed: 0,review_post,removed_special_charas_review_post,to_lower_case_review_post
0,I wanna go strawberry picking,I wanna go strawberry picking,i wanna go strawberry picking
1,I wanna go boating and biking,I wanna go boating and biking,i wanna go boating and biking
2,Worship God in a Church and pray!,Worship God in a Church and pray,worship god in a church and pray
3,market place since I wanna buy strawberry jam ...,market place since I wanna buy strawberry jam ...,market place since i wanna buy strawberry jam ...


In [5]:
# Remove English stop words from each lower-cased post.
# The stop-word list is turned into a set once, so each membership test is a hash
# lookup instead of a scan over the whole list, and the token variable no longer
# shadows the lambda argument.
stop_word_lookup = set(stop_words)
_data['removed_stop_words_review_post'] = _data['to_lower_case_review_post'].map(
    lambda post: " ".join(token for token in post.split() if token not in stop_word_lookup)
)
_data

Unnamed: 0,review_post,removed_special_charas_review_post,to_lower_case_review_post,removed_stop_words_review_post
0,I wanna go strawberry picking,I wanna go strawberry picking,i wanna go strawberry picking,wanna strawberry picking
1,I wanna go boating and biking,I wanna go boating and biking,i wanna go boating and biking,wanna boating biking
2,Worship God in a Church and pray!,Worship God in a Church and pray,worship god in a church and pray,worship god church pray
3,market place since I wanna buy strawberry jam ...,market place since I wanna buy strawberry jam ...,market place since i wanna buy strawberry jam ...,market place wanna buy strawberry jam ube jam ...


In [6]:
from nltk.stem.wordnet import WordNetLemmatizer

# Single WordNet lemmatizer instance shared by every row below.
lem = WordNetLemmatizer()


def _lemmatize_post(post):
    """Lemmatize each whitespace-separated token of `post` and re-join the results with single spaces."""
    return " ".join(lem.lemmatize(str(token)) for token in post.split())


_data['lemmatized_words_review_post'] = _data['removed_stop_words_review_post'].map(_lemmatize_post)
_data

Unnamed: 0,review_post,removed_special_charas_review_post,to_lower_case_review_post,removed_stop_words_review_post,lemmatized_words_review_post
0,I wanna go strawberry picking,I wanna go strawberry picking,i wanna go strawberry picking,wanna strawberry picking,wanna strawberry picking
1,I wanna go boating and biking,I wanna go boating and biking,i wanna go boating and biking,wanna boating biking,wanna boating biking
2,Worship God in a Church and pray!,Worship God in a Church and pray,worship god in a church and pray,worship god church pray,worship god church pray
3,market place since I wanna buy strawberry jam ...,market place since I wanna buy strawberry jam ...,market place since i wanna buy strawberry jam ...,market place wanna buy strawberry jam ube jam ...,market place wanna buy strawberry jam ube jam ...


In [7]:
# Persist the text-cleaning columns built so far, without writing the index.
cleaned_text_path = '../csv/cleaned/cleaned_unseen.csv'
_data.to_csv(cleaned_text_path, index=False)

In [9]:
import spacy

# Part-of-speech tags that are counted for every post.
selected_tags = ["NOUN", "ADJ", "ADV", "PROPN", "VERB", "X"]
# tag -> list holding one count per post, in row order.
tag_compil = {tag: [] for tag in selected_tags}

# Loading the spaCy pipeline does not depend on the row being processed, so it
# is done once here rather than on every loop iteration.
sp = spacy.load('en_core_web_sm')

for idx, value in enumerate(list(_data['lemmatized_words_review_post'])):
    print("Loop: ", idx)
    sen = sp(value)

    # Per-post counters get their own name so `selected_tags` stays the list
    # defined above instead of being rebound to a dict.
    tag_counts = dict.fromkeys(selected_tags, 0)
    for word in sen:
        if word.pos_ in tag_counts:
            tag_counts[word.pos_] += 1

    # Append this post's counts; separate loop names keep idx/value intact.
    for tag, counts in tag_compil.items():
        counts.append(tag_counts[tag])

tag_compil

Loop:  0
Loop:  1
Loop:  2
Loop:  3


{'NOUN': [0, 1, 2, 2],
 'ADJ': [0, 0, 0, 1],
 'ADV': [0, 0, 0, 0],
 'PROPN': [2, 1, 1, 6],
 'VERB': [1, 1, 1, 1],
 'X': [0, 0, 0, 0]}

In [10]:
# Turn the tag -> counts mapping into a frame with one column per tag.
new = pd.DataFrame(tag_compil)

# Write the POS counts to disk without the index column.
pos_tags_path = "../csv/uncleaned/unseen_pos_tags.csv"
new.to_csv(pos_tags_path, index=False)
new

Unnamed: 0,NOUN,ADJ,ADV,PROPN,VERB,X
0,0,0,0,2,1,0
1,1,0,0,1,1,0
2,2,0,0,1,1,0
3,2,1,0,6,1,0


In [11]:
# Attach each list of POS counts to the cleaned frame as a column named after its tag.
for pos_name in selected_tags:
    column = str(pos_name)
    _data[column] = tag_compil[column]

_data

Unnamed: 0,review_post,removed_special_charas_review_post,to_lower_case_review_post,removed_stop_words_review_post,lemmatized_words_review_post,NOUN,ADJ,ADV,PROPN,VERB,X
0,I wanna go strawberry picking,I wanna go strawberry picking,i wanna go strawberry picking,wanna strawberry picking,wanna strawberry picking,0,0,0,2,1,0
1,I wanna go boating and biking,I wanna go boating and biking,i wanna go boating and biking,wanna boating biking,wanna boating biking,1,0,0,1,1,0
2,Worship God in a Church and pray!,Worship God in a Church and pray,worship god in a church and pray,worship god church pray,worship god church pray,2,0,0,1,1,0
3,market place since I wanna buy strawberry jam ...,market place since I wanna buy strawberry jam ...,market place since i wanna buy strawberry jam ...,market place wanna buy strawberry jam ube jam ...,market place wanna buy strawberry jam ube jam ...,2,1,0,6,1,0


In [12]:
from textblob import TextBlob

def analyze_score(score):
    """Return a sentiment label for a numeric score.

    'Negative' when the score is below zero, 'Neutral' when it equals zero,
    and 'Positive' in every other case.
    """
    if score < 0:
        return 'Negative'
    return 'Neutral' if score == 0 else 'Positive'

# Label every lemmatized post from its TextBlob polarity.
# Only the label is kept in the frame, so the polarity is computed once per post
# and passed straight to analyze_score; no temporary subjectivity/polarity
# columns are created and then dropped in place.
_data['sent_analysis'] = _data['lemmatized_words_review_post'].map(
    lambda post: analyze_score(TextBlob(post).sentiment.polarity)
)
_data

Unnamed: 0,review_post,removed_special_charas_review_post,to_lower_case_review_post,removed_stop_words_review_post,lemmatized_words_review_post,NOUN,ADJ,ADV,PROPN,VERB,X,sent_analysis
0,I wanna go strawberry picking,I wanna go strawberry picking,i wanna go strawberry picking,wanna strawberry picking,wanna strawberry picking,0,0,0,2,1,0,Neutral
1,I wanna go boating and biking,I wanna go boating and biking,i wanna go boating and biking,wanna boating biking,wanna boating biking,1,0,0,1,1,0,Neutral
2,Worship God in a Church and pray!,Worship God in a Church and pray,worship god in a church and pray,worship god church pray,worship god church pray,2,0,0,1,1,0,Neutral
3,market place since I wanna buy strawberry jam ...,market place since I wanna buy strawberry jam ...,market place since i wanna buy strawberry jam ...,market place wanna buy strawberry jam ube jam ...,market place wanna buy strawberry jam ube jam ...,2,1,0,6,1,0,Neutral


In [13]:
# Save the fully processed frame (cleaned text, POS counts, sentiment label) without the index.
cleaned_input_path = "../csv/cleaned/cleaned_unseen_input.csv"
_data.to_csv(cleaned_input_path, index=False)
_data

Unnamed: 0,review_post,removed_special_charas_review_post,to_lower_case_review_post,removed_stop_words_review_post,lemmatized_words_review_post,NOUN,ADJ,ADV,PROPN,VERB,X,sent_analysis
0,I wanna go strawberry picking,I wanna go strawberry picking,i wanna go strawberry picking,wanna strawberry picking,wanna strawberry picking,0,0,0,2,1,0,Neutral
1,I wanna go boating and biking,I wanna go boating and biking,i wanna go boating and biking,wanna boating biking,wanna boating biking,1,0,0,1,1,0,Neutral
2,Worship God in a Church and pray!,Worship God in a Church and pray,worship god in a church and pray,worship god church pray,worship god church pray,2,0,0,1,1,0,Neutral
3,market place since I wanna buy strawberry jam ...,market place since I wanna buy strawberry jam ...,market place since i wanna buy strawberry jam ...,market place wanna buy strawberry jam ube jam ...,market place wanna buy strawberry jam ube jam ...,2,1,0,6,1,0,Neutral


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy

# POS-count features, one column per tag.
x = _data[['NOUN', 'ADJ', 'ADV', 'PROPN', 'VERB', 'X']]

# NOTE(review): fit_transform learns a new vocabulary from this input alone, so
# the number of TF-IDF columns depends on these posts (16 in the recorded run,
# 22 features in total). The loaded SVC reports expecting 8326 features, so the
# vectorizer fitted at training time has to be loaded and applied with
# .transform() instead; no persisted vectorizer is visible in this notebook.
tfv = TfidfVectorizer(stop_words = 'english')
tfv_text = tfv.fit_transform(_data['lemmatized_words_review_post'].values.astype('U'))

# Give the TF-IDF frame the same index as _data. With a default 0..n-1 index,
# any gap in _data's index would make the inner join below silently drop rows.
tfv_text_df = pd.DataFrame(tfv_text.toarray(), index=_data.index)
combined = pd.concat([x, tfv_text_df], axis=1, join='inner')

# Convert the combined feature table back to a CSR sparse matrix.
sparsed_data = scipy.sparse.csr_matrix(combined.values)

In [22]:
# Display the combined feature frame: the POS-count columns followed by the TF-IDF columns.
combined

Unnamed: 0,NOUN,ADJ,ADV,PROPN,VERB,X,0,1,2,3,...,6,7,8,9,10,11,12,13,14,15
0,0,0,0,2,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.702035,0.0,0.0,0.0,0.553492,0.0,0.4481,0.0
1,1,0,0,1,1,0,0.0,0.644503,0.644503,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.411378,0.0
2,2,0,0,1,1,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.5
3,2,1,0,6,1,0,0.301115,0.0,0.0,0.301115,...,0.602229,0.301115,0.0,0.301115,0.0,0.301115,0.237402,0.301115,0.192198,0.0


In [23]:
import pickle

# Restore the persisted SVM classifier from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# only load model files that come from a trusted source.
with open('../models/svm_classifier.pkl', 'rb') as model_file:
    svm = pickle.load(model_file)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [24]:
# Predict a label for every row of the combined feature matrix.
# NOTE(review): the recorded run fails on this call with
# "X has 22 features, but SVC is expecting 8326 features as input".
dest_pred = svm.predict(sparsed_data)
# Collect the labels into one frame with an "actual" and a "predict" column.
actual_predict = pd.DataFrame()
# NOTE(review): `dest_test` is not assigned in any cell visible in this notebook;
# presumably it holds ground-truth labels from another session. Confirm where it
# comes from, otherwise this line raises NameError on a fresh kernel.
actual_predict["actual"] = dest_test
actual_predict["predict"] = dest_pred
actual_predict

ValueError: X has 22 features, but SVC is expecting 8326 features as input.

In [None]:
from sklearn import metrics
from sklearn.metrics import classification_report

# Text summary of the classification metrics for dest_pred against dest_test.
report = classification_report(dest_test, dest_pred)
print(report)

# Accuracy of the class predictions.
accuracy = metrics.accuracy_score(dest_test, dest_pred)
print('Accuracy -> ', accuracy)