In [1]:
# Basic Necessities
import pandas as pd

# Model Packages
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline


In [3]:
# Read the training and test CSVs from the local data/ directory.
train_data = pd.read_csv('data/train_clean_data.csv')
test_data = pd.read_csv('data/test_clean_data.csv')

# Pull the model input (tweet text) and the target label out of the training
# frame as plain Python lists, in the same row order.
tweets, sentiment = (
    train_data[column].tolist() for column in ('clean_text', 'sentiment')
)

In [4]:
# Hold out 25% of the labelled tweets for evaluation; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    tweets,
    sentiment,
    test_size=0.25,
    random_state=42,
)

# Tf-Idf Parameter Tuning

In [10]:
# The final result is shown below. Refer to the report for the results of the other parameter settings.
# Final parameters are:
        # ngram_range = (1, 2); min_df and max_df are left at their default values, hence not specified.

## Parameter Tuning On Dataset

In [9]:
# Baseline: unigram + bigram TF-IDF features feeding a logistic regression,
# trained on the training split as-is (no resampling).
model = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    LogisticRegression(max_iter=1000),
)
model.fit(X_train, y_train)

# average=None reports one F1 score per class instead of a single aggregate.
y_pred = model.predict(X_test)
print(f1_score(y_test, y_pred, average=None))

[0.40383198 0.63462356 0.70245989]


## Parameter Tuning on Upsampled Dataset

In [7]:
# Same TF-IDF + logistic regression model, with SMOTE oversampling inserted
# between vectorisation and classification. The imblearn Pipeline applies the
# sampler while fitting only, so X_test is scored without any resampling.
model = Pipeline([
    ('tf_idf', TfidfVectorizer(ngram_range=(1, 2))),
    # SMOTE generates synthetic samples at random; seed it (same value as the
    # train/test split) so the per-class F1 is repeatable across re-runs.
    ('sampling', SMOTE(random_state=42)),
    ('classification', LogisticRegression(max_iter=1000)),
])
model.fit(X_train, y_train)

# average=None reports one F1 score per class instead of a single aggregate.
y_pred = model.predict(X_test)
print(f1_score(y_test, y_pred, average=None))

[0.5193999  0.61698681 0.69598214]


# Logistic Regression Parameter Tuning

In [11]:
# For Logistic Regression, we focused on the parameter 'C' alone. Hence, we ran a grid search over a few different C values.
# The final parameter is:
            # C=1, which is the default value, so we don't need to specify it.

## Grid Search 

In [15]:
# 5-fold grid search over the logistic regression regularisation strength C,
# using a TF-IDF -> SMOTE -> classifier pipeline and weighted F1 as the score.
grid_search_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
    # Seeded so the oversampling, and therefore the selected C, does not vary
    # from one run of the search to the next.
    ('sampling', SMOTE(random_state=42)),
    ('model', LogisticRegression(max_iter=1000)),
])

# 'model__C' routes each candidate value to the C parameter of the 'model' step.
params = [
    {
        'model__C': [0.01, 0.1, 1, 10, 100],
    },
]
grid_search = GridSearchCV(grid_search_pipeline, params, cv=5, scoring='f1_weighted')

# NOTE(review): the search is fitted on the full labelled set (tweets/sentiment),
# not on X_train, so X_test is not an independent hold-out for the chosen C.
grid_search.fit(tweets, sentiment)
print(grid_search.best_params_)

{'model__C': 1}
