<a href="https://www.kaggle.com/code/rafiansari/twitter-sentiment-analysis-on-game-reviews?scriptVersionId=123049240" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer


In [None]:
# Load the training and validation splits.
#
# The CSVs appear to ship without a header row (the notebook overwrites every
# column label right after loading). With the default header=0, pandas would
# promote the first record of each file to column labels and that sample would
# be silently lost once the labels are replaced. Reading with header=None and
# explicit names keeps every row.
column_names = ['ID', 'Game', 'Reviews', 'Comments']

train_data = pd.read_csv(
    '/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv',
    header=None,
    names=column_names,
)
valid_data = pd.read_csv(
    '/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv',
    header=None,
    names=column_names,
)

# Quick look at the first rows of each split
print(train_data.head())
print(valid_data.head())


In [None]:
# Label the four columns identically on both splits.
train_data.columns = valid_data.columns = ['ID', 'Game', 'Reviews', 'Comments']

In [None]:
# Show the column labels of the training frame, then of the validation frame
print(train_data.columns, valid_data.columns, sep='\n')

In [None]:
# Porter stemmer instance used by preprocess()
stemmer = PorterStemmer()

# NLTK's English stop-word list, held as a set for membership tests
stop_words = set(stopwords.words('english'))

import numpy as np

def preprocess(text):
    """Normalise one comment for TF-IDF vectorisation.

    The text is lower-cased, tokenised with NLTK's ``word_tokenize``, stripped
    of the tokens found in ``stop_words`` and reduced to Porter stems.

    Parameters
    ----------
    text : str or None or float
        Raw comment. ``None`` and float NaN are treated as an empty comment;
        any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The surviving stems joined by single spaces ('' for a missing comment).
    """
    # Missing values: None or float NaN. An empty comment tokenises to nothing,
    # so return early.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ''

    # Anything else that is not already a string would crash on .lower()
    if not isinstance(text, str):
        text = str(text)

    # Tokenise the lower-cased text
    tokens = word_tokenize(text.lower())

    # Drop stop words and stem the remaining tokens in a single pass
    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]

    return ' '.join(stems)

# Run preprocess() over every comment of both splits, overwriting the column.
# NOTE: the raw text is replaced, so executing this cell a second time would
# preprocess the already-stemmed comments again.
train_data['Comments'] = train_data['Comments'].map(preprocess)
valid_data['Comments'] = valid_data['Comments'].map(preprocess)


In [None]:
# Targets: the 'Reviews' column of each split
y_train, y_valid = train_data['Reviews'], valid_data['Reviews']

# TF-IDF features capped at 10,000 terms. The vectoriser is fitted on the
# training comments only and then reused to transform the validation comments.
tfidf = TfidfVectorizer(max_features=10000)
X_train = tfidf.fit_transform(train_data['Comments'])
X_valid = tfidf.transform(valid_data['Comments'])

# Fit logistic regression on the training features (fit() returns the estimator)
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the validation predictions against the validation labels
y_pred = lr.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier.
# random_state is fixed because the tree permutes the candidate features at
# every split; without a seed the fitted tree (and the accuracy printed below)
# can change from one run to the next.
dt = DecisionTreeClassifier(random_state=42)

# Train the model on the training data
dt.fit(X_train, y_train)

# Predict the sentiment of the validation data using the trained model
y_pred = dt.predict(X_valid)

# Calculate the accuracy of the model on the validation data
accuracy = accuracy_score(y_valid, y_pred)

# Print the accuracy score
print("Accuracy:", accuracy)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees.
# random_state is fixed so the bootstrap samples and per-split feature subsets
# are the same on every run, which makes the reported accuracy reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model on the training data
rf.fit(X_train, y_train)

# Predict the sentiment of the validation data using the trained model
y_pred = rf.predict(X_valid)

# Calculate the accuracy of the model on the validation data
accuracy = accuracy_score(y_valid, y_pred)

# Print the accuracy score
print("Accuracy:", accuracy)


**You can apply hyperparameter tuning to the logistic regression model to see whether the accuracy changes.**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Inverse regularisation strengths to try for each penalty
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Define the parameter grid to search over.
# Each penalty is paired with a solver that supports it: the default 'lbfgs'
# solver only handles L2, so a plain ['l1', 'l2'] grid makes every L1
# candidate fail to fit and score NaN. 'saga' supports the L1 penalty.
param_grid = [
    {'penalty': ['l2'], 'solver': ['lbfgs'], 'C': c_values},
    {'penalty': ['l1'], 'solver': ['saga'], 'C': c_values},
]

# Create a logistic regression object.
# max_iter is a convergence budget rather than a model hyperparameter, so it
# is set once here instead of being searched (searching it only multiplies the
# number of fits). random_state makes the stochastic 'saga' solver repeatable.
lr = LogisticRegression(max_iter=1000, random_state=42)

# Create a grid search object: 5-fold CV scored by accuracy, with the folds
# fitted in parallel on all available cores
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters: ", grid_search.best_params_)

# Print the accuracy score for the best hyperparameters.
# With the default refit=True, best_estimator_ has already been refitted on
# the full training set, so no additional fit() call is needed.
best_lr = grid_search.best_estimator_
y_pred = best_lr.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)
