# Namazbayev Almas

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [2]:
# nltk.download('stopwords')
data = pd.read_csv("data.tsv", delimiter='	', header=0)
data = data[0:5000]
X = data[['PhraseId', 'SentenceId', 'Phrase']]
y = data['Sentiment']
print(data['Phrase'])

0       A series of escapades demonstrating the adage ...
1       A series of escapades demonstrating the adage ...
2                                                A series
3                                                       A
4                                                  series
                              ...                        
4995                                                veers
4996                                from its comic course
4997                                     its comic course
4998                                         comic course
4999                                               course
Name: Phrase, Length: 5000, dtype: object


# Stopwords

In [3]:
stopWords = set(stopwords.words('english'))
data['Phrase'] = data['Phrase'].apply(lambda x: " ".join([item for item in (x.lower()).split(" ") if item not in stopWords]))
print(data['Phrase'])

0       series escapades demonstrating adage good goos...
1         series escapades demonstrating adage good goose
2                                                  series
3                                                        
4                                                  series
                              ...                        
4995                                                veers
4996                                         comic course
4997                                         comic course
4998                                         comic course
4999                                               course
Name: Phrase, Length: 5000, dtype: object


# Stemming

In [4]:
stemming = PorterStemmer()
data['Phrase'] = data['Phrase'].apply(lambda x: stemming.stem(x))
print(data['Phrase'])

0       series escapades demonstrating adage good goos...
1          series escapades demonstrating adage good goos
2                                                    seri
3                                                        
4                                                    seri
                              ...                        
4995                                                 veer
4996                                          comic cours
4997                                          comic cours
4998                                          comic cours
4999                                                cours
Name: Phrase, Length: 5000, dtype: object


# Bag of words and tfIdf

In [5]:
wordsbag = CountVectorizer()
Xwordsbag = wordsbag.fit_transform(data['Phrase'])
print("Bag of Words:", Xwordsbag.shape)

Bag of Words: (5000, 2081)


In [6]:
vectorizer = TfidfVectorizer()
XtfIdf = vectorizer.fit_transform(data['Phrase'])
print("Words Vectorizer:", XtfIdf.shape)

Words Vectorizer: (5000, 2081)


# Accuracy for wordsbag.

In [7]:
xTrain, xTest, yTrain, yTest = train_test_split(Xwordsbag, y,test_size=0.2)
trainwordsbag, testwordsbag, labeltrainwordsbag, labeltestwordsbag = train_test_split(xTest, yTest, test_size=0.3)

In [8]:
clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
clf.fit(trainwordsbag, labeltrainwordsbag)
print("Logistic Regression: {0}%".format(accuracy_score(labeltestwordsbag, clf.predict(testwordsbag))*100))

Logistic Regression: 62.0%


In [9]:
svcwordsbag = SVC()
svcwordsbag.fit(trainwordsbag, labeltrainwordsbag)
print("SVC: {0}%".format(accuracy_score(labeltestwordsbag, svcwordsbag.predict(testwordsbag))*100))

SVC: 59.0%


In [10]:
classifier = MultinomialNB()
classifier.fit(xTrain.todense(), yTrain)
predicted = classifier.predict(xTest.todense())
print ("NaiveBayes:{}%".format(accuracy_score(yTest, predicted)*100))

NaiveBayes:64.0%


# Accuracy for vectorizer.

In [11]:
xTrain, xTest, yTrain, yTest = train_test_split(XtfIdf, y,test_size=0.2)
trainTFIDF, testTFIDF, labeltrainTFIDF, labeltestTFIDF = train_test_split(xTest, yTest, test_size=0.3)

In [12]:
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
lg.fit(trainTFIDF, labeltrainTFIDF)
print("Logistic Regression: {0}%".format(accuracy_score(labeltestTFIDF, lg.predict(testTFIDF))*100))

Logistic Regression: 61.0%


In [13]:
svcvectorizer = SVC()
svcvectorizer.fit(trainTFIDF, labeltrainTFIDF)
print("SVC: {0}%".format(accuracy_score(labeltestTFIDF, svcvectorizer.predict(testTFIDF))*100))

SVC: 61.0%


In [14]:
classifier = MultinomialNB()
classifier.fit(xTrain.todense(), yTrain)
predicted = classifier.predict(xTest.todense())
print ("NaiveBayes:{}%".format(accuracy_score(yTest, predicted)*100))

NaiveBayes:64.4%
