In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('./data/train.csv', header=None, nrows=10000)
df.columns = ['Review', 'Title', 'Text']
df.head(6)

Unnamed: 0,Review,Title,Text
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."
5,2,an absolute masterpiece,I am quite sure any of you actually taking the...


In [4]:
df.shape

(10000, 3)

### **Step 1 - Remove punctuation marks and Convert to Lowercase**

In [5]:
# start = 50030    # to debug
# end = 50041

In [6]:
import re

# converting to a list and lowercase
# for i in range(len(df)):
#     df.loc[i, 'Title'] = re.sub('[^a-zA-Z]', ' ', str(df.loc[i, 'Title'])).lower()                                                                                                                                                        
#     df.loc[i, 'Text'] = re.sub('[^a-zA-Z]', ' ', str(df.loc[i, 'Text'])).lower()
df['Title'] = df['Title'].astype(str).apply(lambda x: re.sub('[^a-zA-Z]', ' ', x).lower())
df['Text'] = df['Text'].astype(str).apply(lambda x: re.sub('[^a-zA-Z]', ' ', x).lower())

In [7]:
df.head()

Unnamed: 0,Review,Title,Text
0,2,stuning even for the non gamer,this sound track was beautiful it paints the ...
1,2,the best soundtrack ever to anything,i m reading a lot of reviews saying that this ...
2,2,amazing,this soundtrack is my favorite music of all ti...
3,2,excellent soundtrack,i truly like this soundtrack and i enjoy video...
4,2,remember pull your jaw off the floor after he...,if you ve played the game you know how divine...


### **Step 2 - Remove Stopwords and convet to list**

In [8]:
from nltk.corpus import stopwords

notStopwords = ['not']          # 'not' is a stopword in this dataset 
stopwords = {word for word in stopwords.words('english') if word not in notStopwords}

In [9]:
# # use Bagging

# for i in range(len(df)):
#     title:str = df['Title'].iloc[i]
#     text:str = df['Text'].iloc[i]

#     wordList = []
#     for word in title.split():
#         if word not in stopwords:
#             wordList.append(word)
#     df.at[i, 'Title'] = wordList

#     wordList = []
#     for word in text.split():
#         if word not in stopwords:
#             wordList.append(word)    
#     df.at[i, 'Text'] = wordList
#     # print(type(title),'\n',type(text))

In [10]:
# use Bagging - above code cell does the same but it takes a lot of time as apply() of pandas is faster than for loop

def removeStopwordsAndConvertToArray(text):
    if isinstance(text,float):
        return []
    
    return [word for word in text.split() if word not in stopwords]

df['Title'] = df['Title'].astype(str).apply(removeStopwordsAndConvertToArray)
df['Text'] = df['Text'].astype(str).apply(removeStopwordsAndConvertToArray)

In [11]:
# df.iloc[start:end]

In [12]:
df.head()

Unnamed: 0,Review,Title,Text
0,2,"[stuning, even, non, gamer]","[sound, track, beautiful, paints, senery, mind..."
1,2,"[best, soundtrack, ever, anything]","[reading, lot, reviews, saying, best, game, so..."
2,2,[amazing],"[soundtrack, favorite, music, time, hands, int..."
3,2,"[excellent, soundtrack]","[truly, like, soundtrack, enjoy, video, game, ..."
4,2,"[remember, pull, jaw, floor, hearing]","[played, game, know, divine, music, every, sin..."


In [13]:
X = df.iloc[:, 1:]
y = df.iloc[:, 0].astype('int8')

### **Step 3 - Stemming**

In [14]:
from nltk.stem.porter import PorterStemmer

In [15]:
ps = PorterStemmer()
myStemBag = [[], []]        # index 0->Title,1->Text

def stemming(x: list[str], index: int):
    shortBag = []
    for word in x:
        stem_word = ps.stem(word)
        if stem_word not in shortBag:
            shortBag.append(stem_word)

    myStemBag[index].append(shortBag)

X['Title'].apply(lambda x: stemming(x, 0))
X['Text'].apply(lambda x: stemming(x, 1))

# myStemBag

0       None
1       None
2       None
3       None
4       None
        ... 
9995    None
9996    None
9997    None
9998    None
9999    None
Name: Text, Length: 10000, dtype: object

In [16]:
X['Title'].apply(lambda x: ' '.join(x))
X['Text'].apply(lambda x: ' '.join(x))

0       sound track beautiful paints senery mind well ...
1       reading lot reviews saying best game soundtrac...
2       soundtrack favorite music time hands intense s...
3       truly like soundtrack enjoy video game music p...
4       played game know divine music every single son...
                              ...                        
9995    though scholarly biography book reveals much l...
9996    biography arthur aull small town journalist tu...
9997    hard pressed tell boring story plucky country ...
9998    box looked used obviously not new tried contac...
9999    pen shipped promptly classic montblanc pen eve...
Name: Text, Length: 10000, dtype: object

In [17]:
# X['Title'] = X['Title'].apply(lambda x: ' '.join(x))  # Convert list of list to list of string for count vectorizer
# X['Text'] = X['Text'].apply(lambda x: ' '.join(x))  # Convert list of list to list of string for count vectorizer

In [18]:
X.head()

Unnamed: 0,Title,Text
0,"[stuning, even, non, gamer]","[sound, track, beautiful, paints, senery, mind..."
1,"[best, soundtrack, ever, anything]","[reading, lot, reviews, saying, best, game, so..."
2,[amazing],"[soundtrack, favorite, music, time, hands, int..."
3,"[excellent, soundtrack]","[truly, like, soundtrack, enjoy, video, game, ..."
4,"[remember, pull, jaw, floor, hearing]","[played, game, know, divine, music, every, sin..."


In [19]:
y.head()

0    2
1    2
2    2
3    2
4    2
Name: Review, dtype: int8

In [20]:
X['Title'] = X['Title'].astype(str)
X['Text'] = X['Text'].astype(str)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000)         # or else ram is crashed

X['Title'] = list(cv.fit_transform(X['Title']).toarray().astype(np.int8))  
X['Text'] = list(cv.fit_transform(X['Text']).toarray().astype(np.int8))

In [22]:
X.shape,y.shape

((10000, 2), (10000,))

In [23]:
X.dtypes, y.dtypes

(Title    object
 Text     object
 dtype: object,
 dtype('int8'))

In [24]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

X_title = np.array(X['Title'].tolist())
X_text = np.array(X['Text'].tolist())

X_combined = np.hstack((X_title, X_text))

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, stratify=y)

model = SVC()
model.fit(X_train, y_train)

In [25]:
print('training accuracy: ', model.score(X_train, y_train))
print('testing accuracy: ', model.score(X_test, y_test))

training accuracy:  0.976125
testing accuracy:  0.8435
