In [19]:
import pandas as pd
import os
from google.colab import files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
# Mount Google Drive before reading from it: in notebook order this cell sits
# above the dedicated mount cell, so on a fresh "Restart & Run All" the path
# below would not exist yet. Mounting again later is expected to be harmless
# (Colab reports that the drive is already mounted) -- TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

# Labelled training data; later cells read its `review` and `label` columns.
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/train.csv")

In [3]:
# Mount Google Drive at /content/drive so the CSV paths under
# /content/drive/MyDrive used by the data-loading cells resolve.
# NOTE(review): this import lives outside the top import cell, and this cell
# appears after the first read from Drive in notebook order.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
# Download the NLTK data packages this notebook relies on: the stop-word
# lists and WordNet (the corpus behind WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words as a set.
# NOTE(review): `stop_words` is not referenced by any later cell shown here.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [22]:
# Data to predict on; later cells read its `review` and `Id` columns.
# Requires Google Drive to be mounted at /content/drive.
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/test.csv")

In [23]:
# Shared lemmatizer instance used by preprocess_text; it needs the NLTK
# 'wordnet' data downloaded in an earlier cell.
lemmatizer = WordNetLemmatizer()

In [24]:
# Hold out 20% of the labelled reviews; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): X_test / y_test are never scored in the cells shown here, and
# the grid search is fitted on the full `train` frame rather than this split.
X_train, X_test, y_train, y_test = train_test_split(
    train.review,
    train.label,
    test_size=0.2,
    random_state=42,
)

In [25]:
def preprocess_text(text):
    """Normalise a raw review string before vectorisation.

    Steps, in order:
      1. replace every character outside ``[a-zA-Z0-9]`` with a space,
      2. lowercase the result,
      3. tokenise with ``nltk.word_tokenize``,
      4. lemmatise each token with the module-level ``lemmatizer``.

    Requires the NLTK 'punkt' and 'wordnet' data to have been downloaded.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        str: The lemmatised tokens joined by single spaces.
    """
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    # Return the lemmatised string. The previous version built it but then
    # returned `text`, silently discarding the lemmatisation step.
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [11]:
# Tokenizer data needed by nltk.word_tokenize (called in preprocess_text).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
# Clean every review in the training split with preprocess_text.
# NOTE(review): the grid search cell fits on `train.review`, not on this
# preprocessed series -- confirm which input was intended.
X_train = X_train.apply(preprocess_text)

# TF-IDF over unigrams and bigrams (vocabulary capped at 10,000 features)
# feeding a linear SVM with default settings.
pipeline = Pipeline(steps=[
    ('vect', TfidfVectorizer(ngram_range=(1, 2), max_features=10000)),
    ('clf', LinearSVC()),
])

In [27]:
# Hyper-parameter grid for GridSearchCV. Keys use the `<step>__<param>`
# convention to address parameters of the named pipeline steps.
parameter = {
    # TfidfVectorizer: drop terms whose document frequency exceeds this
    # proportion of the documents.
    'vect__max_df': [0.5, 0.75, 1.0],
    # LinearSVC: regularisation parameter C.
    'clf__C': [0.1, 1, 10],
}

In [16]:
# Exhaustive search over the `parameter` grid with 5-fold cross-validation.
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameter, cv=5)

# NOTE(review): this fits on the raw, full `train.review` / `train.label`,
# so neither the preprocessed X_train nor the train/test split affects the
# fitted model -- confirm that this is intended.
grid_search.fit(train.review, train.label)


In [28]:
# Mean cross-validated score of the best parameter combination; no `scoring`
# was passed, so the estimator's default scorer is used.
grid_search.best_score_

0.8752000000000001

In [29]:
# Predict labels for the raw test reviews with the fitted grid search
# (predict uses the best estimator found during the search).
svm_prediction = grid_search.predict(test.review)

In [30]:
# Two-column frame pairing each test row's Id with its predicted label.
prediction_df = pd.DataFrame(
    {
        "Id": test.Id,
        "Category": svm_prediction,
    }
)

In [31]:
# Preview the first ten predictions as a sanity check.
prediction_df.head(10)

Unnamed: 0,Id,Category
0,0,1
1,1,1
2,2,0
3,3,0
4,4,1
5,5,1
6,6,0
7,7,1
8,8,1
9,9,1


In [32]:
# Write the predictions to a CSV in the working directory; index=False keeps
# the DataFrame index out of the file so only Id and Category are written.
prediction_df.to_csv("svm_prediction.csv", index=False)

In [33]:
# Send the CSV written by the previous cell to the browser as a download
# (Colab-only helper from google.colab.files).
files.download("svm_prediction.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>