# TF-IDF Vectorizer

In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Read the data

In [2]:
# Load the review dataset from the tab-separated file into a DataFrame.
reviews_path = "../../data/cleaned_reviews.tsv"
df = pd.read_csv(reviews_path, sep="\t")

## Defining a helpful review + Splitting the data

In [3]:
# Minimum voteSuccess value for a review to be labelled helpful.
split = 0.1

# Binary target: 1 where voteSuccess reaches the threshold, otherwise 0.
meets_threshold = df["voteSuccess"] >= split
df["helpful"] = np.where(meets_threshold, 1, 0)

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df["reviewText"],
    df["helpful"],
    test_size=0.25,
    random_state=30,
)

## Vectorization with TF-IDF

In [4]:

# Unigram TF-IDF features; min_df=0.01 drops terms that occur in fewer than
# 1% of the training documents.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=0.01)

# Missing reviews become empty documents rather than the literal token "nan",
# and the text stays as ordinary Python strings: a NumPy 'U' array is
# fixed-width, so every review would be padded to the longest review's length.
# The vocabulary and IDF weights are fitted on the training reviews only and
# then reused unchanged for the test reviews.
tfidf_train = vectorizer.fit_transform(x_train.fillna("").astype(str))
tfidf_test = vectorizer.transform(x_test.fillna("").astype(str))

## Fitting

In [5]:
# Train a linear SVM on the TF-IDF training matrix (fixed seed, at most
# 10,000 solver iterations); fit() hands back the fitted estimator.
clf = LinearSVC(max_iter=10_000, random_state=0).fit(tfidf_train, y_train)

# Predicted labels for the held-out reviews.
y_test_pred = clf.predict(tfidf_test)

## Result

In [6]:
# Header names the pipeline this notebook actually runs: TF-IDF features fed
# to a linear SVM (not a count-vectorizer document-term matrix).
print("TF-IDF Vectorizer - SVM/SVC")
# Per-class precision / recall / F1 on the held-out set; label 0 is reported
# as "Unhelpful" and label 1 as "Helpful".
print(classification_report(y_test, y_test_pred, target_names=["Unhelpful", "Helpful"]))

Document-term Matrix(Count Vectorizer) - SVM/SVC
              precision    recall  f1-score   support

   Unhelpful       0.91      0.99      0.95      5769
     Helpful       0.63      0.10      0.17       634

    accuracy                           0.91      6403
   macro avg       0.77      0.55      0.56      6403
weighted avg       0.88      0.91      0.87      6403

