In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rafiansari/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rafiansari/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
train_data = pd.read_csv('twitter_training.csv')
valid_data = pd.read_csv('twitter_validation.csv')

print(train_data.head())
print(valid_data.head())


   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  
0  I am coming to the borders and I will kill you...     
1  im getting on borderlands and i will kill you ...     
2  im coming on borderlands and i will murder you...     
3  im getting on borderlands 2 and i will murder ...     
4  im getting into borderlands and i can murder y...     
   3364   Facebook Irrelevant  \
0   352     Amazon    Neutral   
1  8312  Microsoft   Negative   
2  4371      CS-GO   Negative   
3  4433     Google    Neutral   
4  6273       FIFA   Negative   

  I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tom’s great auntie as ‘Hayley can’t get out of bed’ and told to his grandma, who now thinks I’m a lazy, terrible person 🤣  
0

In [3]:
train_data.columns = ['ID', 'Game', 'Reviews', 'Comments']
valid_data.columns = ['ID', 'Game', 'Reviews', 'Comments']

In [4]:
# Check the column names of the DataFrame
print(train_data.columns)

# Check the column names of the DataFrame
print(valid_data.columns)

Index(['ID', 'Game', 'Reviews', 'Comments'], dtype='object')
Index(['ID', 'Game', 'Reviews', 'Comments'], dtype='object')


In [7]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

import numpy as np

def preprocess(text):

    # Convert null/NaN values to empty strings
    if isinstance(text, float) and np.isnan(text):
        text = ''
    # Convert text to lowercase
    text = text.lower()

    # Tokenize text
    words = word_tokenize(text)

    # Remove stop words
    words = [word for word in words if word not in stop_words]

    # Stem words
    words = [stemmer.stem(word) for word in words]

    # Join words
    text = ' '.join(words)

    return text

train_data['Comments'] = train_data['Comments'].apply(preprocess)
valid_data['Comments'] = valid_data['Comments'].apply(preprocess)


In [9]:
# Feature extraction using TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=10000)
X_train = tfidf.fit_transform(train_data['Comments'])
X_valid = tfidf.transform(valid_data['Comments'])
y_train = train_data['Reviews']
y_valid = valid_data['Reviews']

# Train a logistic regression model on the training data
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict the sentiment of the validation data using the trained model
y_pred = lr.predict(X_valid)

# Calculate the accuracy of the model on the validation data
accuracy = accuracy_score(y_valid, y_pred)

# Print the accuracy score
print("Accuracy:", accuracy)



Accuracy: 0.8588588588588588


In [10]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier
dt = DecisionTreeClassifier()

# Train the model on the training data
dt.fit(X_train, y_train)

# Predict the sentiment of the validation data using the trained model
y_pred = dt.predict(X_valid)

# Calculate the accuracy of the model on the validation data
accuracy = accuracy_score(y_valid, y_pred)

# Print the accuracy score
print("Accuracy:", accuracy)


Accuracy: 0.8858858858858859


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier with 100 trees
rf = RandomForestClassifier(n_estimators=100)

# Train the model on the training data
rf.fit(X_train, y_train)

# Predict the sentiment of the validation data using the trained model
y_pred = rf.predict(X_valid)

# Calculate the accuracy of the model on the validation data
accuracy = accuracy_score(y_valid, y_pred)

# Print the accuracy score
print("Accuracy:", accuracy)


Accuracy: 0.9369369369369369


In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# define the parameter grid to search over
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'max_iter': [100, 500, 1000, 5000]}

# create a logistic regression object
lr = LogisticRegression()

# create a grid search object
grid_search = GridSearchCV(lr, param_grid, cv=5, scoring='accuracy')

# fit the grid search to the data
grid_search.fit(X_train, y_train)

# print the best hyperparameters
print("Best hyperparameters: ", grid_search.best_params_)

# print the accuracy score for the best hyperparameters
best_lr = grid_search.best_estimator_
best_lr.fit(X_train, y_train)
y_pred = best_lr.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred)
print("Accuracy:", accuracy)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best hyperparameters:  {'C': 0.1, 'max_iter': 100, 'penalty': 'l2'}
Accuracy: 0.7047047047047047


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
