In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:


# Sentiment classification of lemmatized movie reviews: TF-IDF features feed a
# logistic regression, the model is checked on a hold-out split, and it is then
# used to label the reviews in the test file.

# Settings a reader might want to tune, gathered in one place.
TRAIN_PATH = 'moved_imdb_reviews_small_lemm_train.tsv'
TEST_PATH = 'moved_imdb_reviews_small_lemm_test.tsv'
PREDICTIONS_PATH = 'predictions'  # NOTE(review): no file extension - presumably intentional; confirm
TEXT_COLUMN = 'review_lemm'
LABEL_COLUMN = 'pos'
VALIDATION_FRACTION = 0.2
SEED = 42

# Load the training data (tab-separated)
train_data = pd.read_csv(TRAIN_PATH, sep='\t')

# Split the data into training and validation sets.
# pandas reads an empty text field as NaN and TfidfVectorizer refuses NaN
# documents, so missing reviews are replaced with an empty string first.
# When no review is missing this leaves the data, and the split, unchanged.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data[TEXT_COLUMN].fillna(''),
    train_data[LABEL_COLUMN],
    test_size=VALIDATION_FRACTION,
    random_state=SEED,
)

# Convert the lemmatized reviews into feature vectors.
# The vocabulary and IDF weights are learned from the training split only;
# the validation split is merely transformed with them.
vectorizer = TfidfVectorizer()
train_features = vectorizer.fit_transform(train_texts)
val_features = vectorizer.transform(val_texts)

# Train a logistic regression model
clf = LogisticRegression()
clf.fit(train_features, train_labels)

# Evaluate the performance of the model on the validation set
val_preds = clf.predict(val_features)
val_acc = accuracy_score(val_labels, val_preds)
print('Validation accuracy:', val_acc)

# Load the test data (tab-separated)
test_data = pd.read_csv(TEST_PATH, sep='\t')

# Convert the lemmatized reviews in the test set into feature vectors,
# reusing the vectorizer fitted on the training split. Missing reviews are
# treated as empty text so a single blank row cannot abort the whole run.
test_features = vectorizer.transform(test_data[TEXT_COLUMN].fillna(''))

# Use the trained model to predict the tonality of the test reviews
test_preds = clf.predict(test_features)

# Save the predicted tonalities to the 'pos' column of the test table and
# write the table out (comma-separated, without the index column).
test_data[LABEL_COLUMN] = test_preds
test_data.to_csv(PREDICTIONS_PATH, index=False)


Validation accuracy: 0.8349753694581281
