# Term Frequency - Inverse Document Frequency

In [18]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize

In [11]:
# Load the header-less, two-column CSV: sentiment label first, raw text second.
# NOTE(review): cp437 maps every byte to some character, so a wrong encoding
# would never raise here -- confirm it matches how the file was written.
df = pd.read_csv(
    "/tmp/all-data.csv",
    names=["sentiment", "text"],
    header=None,
    encoding="cp437",
)

# Fit the encoder on the sentiment column, then store the encoded labels in
# a new "y" column; `le` is kept so predictions can be mapped back to labels.
le = LabelEncoder().fit(df["sentiment"])
df["y"] = le.transform(df["sentiment"])

# Single shared stemmer instance, reused for every document.
ps = PorterStemmer()


def stem_sentence(text):
    """Return *text* with every token replaced by its Porter stem.

    The input is split with NLTK's ``word_tokenize``, each token is run
    through the module-level ``ps`` stemmer, and the resulting stems are
    joined back together with single spaces.
    """
    stems = (ps.stem(token) for token in word_tokenize(text))
    return " ".join(stems)

# Stem every document once, before splitting, so the train and test rows
# go through exactly the same preprocessing.
df["processed_text"] = df["text"].map(stem_sentence)

# Hold out 10% of the rows for testing, stratified on the encoded label.
# A fixed random_state pins the shuffle: without it each run draws a different
# split, so the reported accuracy cannot be reproduced or compared across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.1,
    stratify=df["y"],
    random_state=42,
)

In [23]:
# Build the TF-IDF vocabulary from the training split only; min_df=5 is the
# document-frequency cut-off passed to the vectorizer.
# NOTE(review): stop_words="english" is applied to text that has already been
# stemmed, so stop words whose stem differs from the original spelling may not
# be filtered -- confirm this is intended.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    lowercase=True,
    min_df=5,
)
train_tfidf = tfidf_vectorizer.fit_transform(
    train_df["processed_text"].values
)

In [24]:
# Logistic regression on the TF-IDF features.
# `multi_class` is deliberately not passed: the argument is deprecated in
# recent scikit-learn releases and scheduled for removal. With the default
# lbfgs solver, a multinomial fit is already what is used when there are
# three or more classes.
# NOTE(review): assumes "y" has at least three classes -- confirm.
model = LogisticRegression(max_iter=500)
model.fit(train_tfidf, train_df["y"])

# Transform (not fit_transform) the held-out text so it reuses the vocabulary
# and IDF weights fitted on the training split.
test_tfidf = tfidf_vectorizer.transform(test_df["processed_text"])
test_preds = model.predict(test_tfidf)
# Left as a bare expression so the notebook cell displays the accuracy.
accuracy_score(test_df["y"], test_preds)

0.7587628865979381