In [54]:
import pandas as pd
import numpy as np
df = pd.read_csv('corpus.csv', encoding='latin1')
print(df.head(15))

                                                 text        label
0    Stuning even for the non-gamer: This sound tr...  __label__2 
1    The best soundtrack ever to anything.: I'm re...  __label__2 
2    Amazing!: This soundtrack is my favorite musi...  __label__2 
3    Excellent Soundtrack: I truly like this sound...  __label__2 
4    Remember, Pull Your Jaw Off The Floor After H...  __label__2 
5    an absolute masterpiece: I am quite sure any ...  __label__2 
6    Buyer beware: This is a self-published book, ...  __label__1 
7    Glorious story: I loved Whisper of the wicked...  __label__2 
8    A FIVE STAR BOOK: I just finished reading Whi...  __label__2 
9    Whispers of the Wicked Saints: This was a eas...  __label__2 
10   The Worst!: A complete waste of time. Typogra...  __label__1 
11   Great book: This was a great book,I just coul...  __label__2 
12   Great Read: I thought this book was brilliant...  __label__2 
13   Oh please: I guess you have to be a romance n...  __label

In [55]:
num_rows = len(df)
print("Number of rows:", num_rows)

Number of rows: 10000


In [56]:
#pre-processing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Function to remove punctuation
def remove_punctuation(text):
    punctuation = string.punctuation
    cleaned_text = text.translate(str.maketrans('', '', punctuation))
    return cleaned_text

# Function to remove stop words
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    cleaned_text = ' '.join(filtered_tokens)
    return cleaned_text

# Iterate over each row using a for loop
for index, row in df.iterrows():
    cleaned_text = remove_punctuation(row['text'])
    cleaned_text = remove_stopwords(cleaned_text)
    df.loc[index, 'cleaned_text'] = cleaned_text


In [57]:
df.head()

Unnamed: 0,text,label,cleaned_text
0,Stuning even for the non-gamer: This sound tr...,__label__2,Stuning even nongamer sound track beautiful pa...
1,The best soundtrack ever to anything.: I'm re...,__label__2,best soundtrack ever anything Im reading lot r...
2,Amazing!: This soundtrack is my favorite musi...,__label__2,Amazing soundtrack favorite music time hands i...
3,Excellent Soundtrack: I truly like this sound...,__label__2,Excellent Soundtrack truly like soundtrack enj...
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2,Remember Pull Jaw Floor Hearing youve played g...


In [75]:
from sklearn.feature_extraction.text import TfidfVectorizer

Tfidf_vect = TfidfVectorizer()
Tfidf_vect.fit(df["cleaned_text"])
Tfidf_matrix = Tfidf_vect.transform(df["cleaned_text"])
print(Tfidf_matrix)
print(Tfidf_matrix.shape)

  (0, 39508)	0.12221469611247483
  (0, 38788)	0.07260986456135858
  (0, 38090)	0.10387638514492085
  (0, 36298)	0.12165095677748702
  (0, 34870)	0.11141032609145377
  (0, 34162)	0.22033739761472357
  (0, 33767)	0.14825557274837042
  (0, 33098)	0.0987406120551005
  (0, 33082)	0.18824292643744853
  (0, 31496)	0.22033739761472357
  (0, 29075)	0.15041073400687754
  (0, 26798)	0.23748736595907363
  (0, 26168)	0.0829677199069675
  (0, 25695)	0.18824292643744853
  (0, 25147)	0.22033739761472357
  (0, 24254)	0.22033739761472357
  (0, 23545)	0.17854582895529747
  (0, 22642)	0.11270282544462244
  (0, 20894)	0.11194012649276325
  (0, 19724)	0.22033739761472357
  (0, 17949)	0.1694688643182667
  (0, 16477)	0.12820272425955975
  (0, 16061)	0.17219569084881103
  (0, 15788)	0.22033739761472357
  (0, 14954)	0.1300017857198499
  :	:
  (9999, 24388)	0.0685825625225324
  (9999, 23214)	0.09898141979780836
  (9999, 23077)	0.1612799862152586
  (9999, 22972)	0.10889920162622028
  (9999, 21226)	0.0564659796773

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label'])
y = df['encoded_label'].values
X=Tfidf_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)
print(X_train.shape)

  (0, 37904)	0.142001979617248
  (0, 37689)	0.05884209382739533
  (0, 36880)	0.05985929442649011
  (0, 36726)	0.07357598339987492
  (0, 35846)	0.04555459192199217
  (0, 35657)	0.06134186776174988
  (0, 35650)	0.13345954177917874
  (0, 33904)	0.08455951335791492
  (0, 33187)	0.10957728747549571
  (0, 30103)	0.10535520809121537
  (0, 30010)	0.0984228628161237
  (0, 29051)	0.15315640427662003
  (0, 29050)	0.21154643373888002
  (0, 29049)	0.12572285958165288
  (0, 28488)	0.08635607212928496
  (0, 27859)	0.09300820584481523
  (0, 27538)	0.15315640427662003
  (0, 27103)	0.07346185218999049
  (0, 26525)	0.08896394521057596
  (0, 26214)	0.07558634546592673
  (0, 26082)	0.15315640427662003
  (0, 24916)	0.07278121090172962
  (0, 24564)	0.146631484133768
  (0, 24368)	0.09379335829960365
  (0, 24000)	0.11598750506985515
  :	:
  (7999, 15228)	0.05892500846076493
  (7999, 14829)	0.10407319505223846
  (7999, 14715)	0.13329399964184319
  (7999, 13864)	0.06394267050568317
  (7999, 13112)	0.174621205300

In [81]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
model=SVC(kernel='linear')
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
accuracy=accuracy_score(y_pred,y_test)
print(accuracy)

0.8575
