In [58]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import re


In [59]:
# Read the labelled training CSV and the unlabelled test CSV into DataFrames
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train_nlp.csv', '/content/test_nlp.csv')
)

In [60]:
# Inspect datasets
# A notebook cell only auto-renders its *last* bare expression, so the
# head()/describe() tables are passed to display() explicitly (display is
# provided by the IPython kernel); otherwise all but the final table would be
# computed and silently thrown away. .info() prints its own report.
display(train_data.head())
display(test_data.head())
train_data.info()
test_data.info()
display(train_data.describe())
display(test_data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44798 entries, 0 to 44797
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            44798 non-null  int64 
 1   Review_Title  44798 non-null  object
 2   Review        44798 non-null  object
 3   Rating        44798 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.4+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14932 entries, 0 to 14931
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            14932 non-null  int64 
 1   Review_Title  14932 non-null  object
 2   Review        14932 non-null  object
dtypes: int64(1), object(2)
memory usage: 350.1+ KB


Unnamed: 0,ID
count,14932.0
mean,8186.164747
std,4726.046432
min,0.0
25%,4086.75
50%,8169.5
75%,12264.25
max,16374.0


In [61]:
# Combine and preprocess text data
def preprocess_text(title, review):
    """Merge a review's title and body into one normalised, lowercase string.

    Args:
        title: Review title (scalar). ``None``/NaN is treated as empty text.
        review: Review body (scalar). ``None``/NaN is treated as empty text.

    Returns:
        The title and review joined by a space, with every character that is
        not an ASCII letter or whitespace removed, runs of whitespace collapsed
        to a single space, surrounding whitespace trimmed, and the result
        lowercased.
    """
    # A missing cell would otherwise be formatted as the literal word
    # "None"/"nan" and leak into the vocabulary as a bogus token.
    parts = ["" if pd.isna(part) else str(part) for part in (title, review)]
    # Combine title and review
    text = " ".join(parts)
    # Remove special characters, numbers, and extra whitespace
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

In [62]:
# Build the cleaned title+review text for every training row
train_data['Combined_Text'] = [
    preprocess_text(title, review)
    for title, review in zip(train_data['Review_Title'], train_data['Review'])
]

In [63]:
# Build the cleaned title+review text for every test row
test_data['Combined_Text'] = [
    preprocess_text(title, review)
    for title, review in zip(test_data['Review_Title'], test_data['Review'])
]


In [64]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['Combined_Text'],
    train_data['Rating'],
    random_state=42,
    test_size=0.2,
)


In [65]:
# Turn text into TF-IDF feature matrices. The vectorizer is fitted on the
# training split only and then reused, unchanged, for validation and test.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (
    tfidf.transform(corpus)
    for corpus in (X_val, test_data['Combined_Text'])
)


In [66]:
# Fit a logistic-regression classifier on the TF-IDF training matrix
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X=X_train_tfidf, y=y_train)


In [67]:
# Score the classifier on the held-out validation split
val_predictions = model.predict(X=X_val_tfidf)
f1 = f1_score(y_true=y_val, y_pred=val_predictions)

print(f"Validation F1-score: {f1:.4f}")


Validation F1-score: 0.9884


In [68]:
# Run the trained classifier over the vectorised test reviews
test_predictions = model.predict(X=X_test_tfidf)

In [69]:
# Assemble the submission table: one row per test ID with its predicted Rating
submission = test_data[['ID']].assign(Rating=test_predictions)

In [70]:
# Write the submission table to CSV without the DataFrame index column
submission_path = '/content/submission_nlp.csv'
submission.to_csv(path_or_buf=submission_path, index=False)

print(f"Submission file saved at {submission_path}")

Submission file saved at /content/submission_nlp.csv
