### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# NLP libraries to clean the text data
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Vectorization technique TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Read the raw CSV files. header=None tells pandas the first row is data,
# not column names, so columns get integer labels.
data = pd.read_csv('format.csv', header=None)
train = pd.read_csv('train.csv', header=None)
test = pd.read_csv('test.csv', header=None)


In [5]:
# Column names for the training file: label first, review text second.
headerList = ['Sentiment', 'Text']

# Write both frames back out with explicit column headers and without the
# row index, so they can be reloaded with named columns.
train.to_csv("new_train.csv", index=False, header=headerList)
test.to_csv("new_test.csv", index=False, header=['Text'])


In [6]:
# Reload the re-written files; the first row now supplies the column names.
train = pd.read_csv('new_train.csv')
test = pd.read_csv('new_test.csv')

In [7]:
# Preview the first five rows (head() defaults to n=5).
train.head()

Unnamed: 0,Sentiment,Text
0,-1,"Eat at Fioris, they said. Youll like it, they..."
1,-1,I just don't understand the appeal. I've trie...
2,1,This is my go to place for a really good beef ...
3,-1,"Not impressed. When I ordered the Oyako bowl, ..."
4,-1,This is the first time ever I wrote a bad revi...


In [136]:
# Dimensions of the training frame as (rows, columns).
train.shape

(18000, 2)

In [137]:
# Dimensions of the test frame as (rows, columns); it has no label column.
test.shape

(18000, 1)

In [138]:
# Confirm the column names read back from new_train.csv.
train.columns

Index(['Sentiment', 'Text'], dtype='object')

In [8]:
# Class balance check: count the rows per sentiment label in the training data.
train['Sentiment'].value_counts()

-1    9000
 1    9000
Name: Sentiment, dtype: int64

#### Text Processing

a. Replace every non-letter character (punctuation, digits, symbols) with a space and lowercase the text

b. Remove stop words

c. Stemming

In [9]:
ps = PorterStemmer()

# Build the stop-word lookup once. stopwords.words('english') produces a new
# list on each call, so calling it per token (as before) repeated that work
# for every word of every review and then did a linear scan of the list.
# A frozenset gives the same membership result with a hashed lookup.
STOP_WORDS = frozenset(stopwords.words('english'))


def wordopt(text):
    """Normalise one review so it can be vectorised.

    Steps, in order:
      1. replace every character outside a-z / A-Z with a space,
      2. lowercase,
      3. split on whitespace,
      4. drop English stop words,
      5. Porter-stem the remaining tokens,
      6. re-join the stems with single spaces.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the float NaN
        pandas produces for an empty CSV field) is treated as empty text.

    Returns
    -------
    str
        Space-separated stemmed tokens; '' when nothing survives cleaning.
    """
    # re.sub raises TypeError on non-string input; map missing text to ''.
    if not isinstance(text, str):
        return ''
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    # Stop words are filtered before stemming, matching the original order.
    return ' '.join(ps.stem(tok) for tok in tokens if tok not in STOP_WORDS)

In [10]:
# Run the text-cleaning function over every training review.
# Note: this overwrites the raw text, so re-running the cell cleans already-cleaned text.
train['Text'] = train['Text'].map(wordopt)

In [11]:
# Preview the first five rows after cleaning (head() defaults to n=5).
train.head()

Unnamed: 0,Sentiment,Text
0,-1,eat fiori said youll like said n ni conveni lo...
1,-1,understand appeal tri place twice think someho...
2,1,go place realli good beef enchilada red sauc a...
3,-1,impress order oyako bowl convers went someth l...
4,-1,first time ever wrote bad review frustrat hert...


### Splitting Data

In [30]:
# Features are the review text; the target is the sentiment label.
X, Y = train['Text'], train['Sentiment']

# Hold out 30% of the labelled rows for validation. The fixed random_state
# makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=0, test_size=0.30
)

In [31]:
X_train.shape, y_train.shape

((12600,), (12600,))

### Vectorization of Text Data

In [32]:
# Learn the TF-IDF vocabulary and weights from the training split only, then
# reuse the fitted vectorizer on the held-out split so both share one feature space.
vectorization = TfidfVectorizer()
Xv_train = vectorization.fit_transform(X_train)
Xv_test = vectorization.transform(X_test)

### Performing KNN Classification

In [23]:
# k-nearest-neighbours classifier that votes over the 20 closest training samples.
knn = KNeighborsClassifier(n_neighbors=20)

In [33]:
# Fit the classifier on the TF-IDF features and labels of the training split.
knn.fit(Xv_train,y_train)

KNeighborsClassifier(n_neighbors=20)

In [34]:
# Evaluate on the held-out split; for scikit-learn classifiers score()
# reports mean accuracy.
knn.score(Xv_test,y_test)

0.7955555555555556

In [36]:
# Test Data: clean the unlabelled reviews with the same function used for training.
# Note: this overwrites the raw text, so re-running the cell cleans already-cleaned text.
test['Text'] = test['Text'].map(wordopt)

In [37]:
# Project the unlabelled test reviews into the TF-IDF space fitted on the
# training split (transform only, no refit).
# Note: lowercase x_test is distinct from X_test, the held-out validation
# text created by train_test_split.
t = test['Text']
x_test = vectorization.transform(t)

In [38]:
# Predict a sentiment label for each unlabelled test review.
# The result is only displayed here, not stored in a variable.
knn.predict(x_test)

array([-1,  1, -1, ...,  1,  1,  1], dtype=int64)