In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

df = pd.read_csv(r"C:\Python Codes\Code\Code\ChatGPT-sentiment-analysis\processed_file.csv")

df.head()

Unnamed: 0,num_labels,processed_tweets
0,1,alright ChatGPT AI pretty neat @nocontextvarg
1,1,ok definitely go replace soon importantly chat...
2,0,ask chatgpt explain 4 famous chess quote Timma...
3,1,chatgpt popular good
4,0,write ida plugin query chatgpt explain decompi...


In [2]:
df = df.dropna().drop_duplicates()

In [3]:
X = df.processed_tweets
y = df.num_labels

# Models:
- KNN
- Gradient Boosting
- Multinomial NB
- Random Forest
- Logistic Regression
- Decision Tree Classifier

# 1. Multinomial NB:

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

mnb_tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('mnb', MultinomialNB())
])

params = {
    "mnb__alpha": [True, False]
}

grid_search_mnb_tfidf = GridSearchCV(
    mnb_tfidf_pipeline,
    param_grid=params,
    cv=10,
    scoring="accuracy",
    n_jobs=4
)

grid_search_mnb_tfidf.fit(X,y)

In [5]:
results = []

In [6]:
print("Best Hyperparameters: ", grid_search_mnb_tfidf.best_params_)
print("Best Accuracy: ", grid_search_mnb_tfidf.best_score_)

results.append(
    (
        "mnb",
        grid_search_mnb_tfidf.best_params_,
        grid_search_mnb_tfidf.best_score_
    )
)

Best Hyperparameters:  {'mnb__alpha': True}
Best Accuracy:  0.5873312011371713


# 2. Random Forest Classifier

In [7]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

rf_tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier(random_state=42))
])

params = {
    "rf__n_estimators": range(1,30,5),
    "rf__criterion": ["gini", "entropy"]
}

grid_search_rf_tfidf = GridSearchCV(
    rf_tfidf_pipeline,
    param_grid=params,
    cv=10,
    scoring="accuracy",
    n_jobs=4
)

grid_search_rf_tfidf.fit(X, y)

In [8]:
print("Best Hyperparameters: ", grid_search_rf_tfidf.best_params_)
print("Best Accuracy: ", grid_search_rf_tfidf.best_score_)

results.append(
    (
        "rf",
        grid_search_rf_tfidf.best_params_,
        grid_search_rf_tfidf.best_score_
    )
)

Best Hyperparameters:  {'rf__criterion': 'gini', 'rf__n_estimators': 26}
Best Accuracy:  0.7203802416488985


# 3. Gradient Boosting Classifier

In [9]:
gb_tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('gb', GradientBoostingClassifier(random_state=0))
])

params = {
    "gb__n_estimators": range(1,30,5),
    "gb__criterion": ["friedman_mse", "squared_error"]
}

grid_search_gb_tfidf = GridSearchCV(
    gb_tfidf_pipeline,
    param_grid=params,
    cv=10,
    scoring="accuracy",
    n_jobs=4
)

grid_search_gb_tfidf.fit(X, y)

In [10]:
print("Best Hyperparameters: ", grid_search_gb_tfidf.best_params_)
print("Best Accuracy: ", grid_search_gb_tfidf.best_score_)

results.append(
    (
        "gb",
        grid_search_gb_tfidf.best_params_,
        grid_search_gb_tfidf.best_score_
    )
)

Best Hyperparameters:  {'gb__criterion': 'friedman_mse', 'gb__n_estimators': 26}
Best Accuracy:  0.5554371002132197


# 4. KNN

In [11]:
from sklearn.neighbors import KNeighborsClassifier

knn_tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('knn', KNeighborsClassifier())
])

params = {
    "knn__n_neighbors": range(1,30,5),
    "knn__weights": ["uniform", "distance"],
    "knn__algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
}

grid_search_knn_tfidf = GridSearchCV(
    knn_tfidf_pipeline,
    param_grid=params,
    cv=10,
    scoring="accuracy",
    n_jobs=4
)

grid_search_knn_tfidf.fit(X, y)

In [12]:
print("Best Hyperparameters: ", grid_search_knn_tfidf.best_params_)
print("Best Accuracy: ", grid_search_knn_tfidf.best_score_)

results.append(
    (
        "knn",
        grid_search_knn_tfidf.best_params_,
        grid_search_knn_tfidf.best_score_
    )
)

Best Hyperparameters:  {'knn__algorithm': 'auto', 'knn__n_neighbors': 26, 'knn__weights': 'distance'}
Best Accuracy:  0.523454157782516


# 5. Decision Tree Classifier

In [13]:
from sklearn.tree import DecisionTreeClassifier

dt_tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('dt', DecisionTreeClassifier(random_state=42))
])

params = {
    "dt__criterion": ["gini", "entropy"],
    "dt__splitter": ["best", "random"]
}

grid_search_dt_tfidf = GridSearchCV(
    dt_tfidf_pipeline,
    param_grid=params,
    cv=10,
    scoring="accuracy",
    n_jobs=4
)

grid_search_dt_tfidf.fit(X, y)

In [14]:
print("Best Hyperparameters: ", grid_search_dt_tfidf.best_params_)
print("Best Accuracy: ", grid_search_dt_tfidf.best_score_)

results.append(
    (
        "dt",
        grid_search_dt_tfidf.best_params_,
        grid_search_dt_tfidf.best_score_
    )
)

Best Hyperparameters:  {'dt__criterion': 'gini', 'dt__splitter': 'random'}
Best Accuracy:  0.6937277896233119


# 6. Logistic Regression Classifier

In [15]:
from sklearn.linear_model import LogisticRegression

lr_tfidf_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('lr', LogisticRegression(random_state=42)) 
])

params = {
    "lr__penalty": ["l1", "l2", "elasticnet", None],
    "lr__dual": [True, False],
    "lr__C": range(1, 30, 5)
}

grid_search_lr_tfidf = GridSearchCV(
    lr_tfidf_pipeline,
    param_grid=params,
    cv=10,
    scoring="accuracy",
    n_jobs=4
)

grid_search_lr_tfidf.fit(X, y)

360 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\raiya\anaconda3\envs\dl_gpu\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\raiya\anaconda3\envs\dl_gpu\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "C:\Users\raiya\anaconda3\envs\dl_gpu\lib\site-packages\sklearn\pipeline.py", line 427, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\raiya\anaconda3\envs\dl_gpu\lib\site-packages\sklearn\base.py", line

In [16]:
print("Best Hyperparameters: ", grid_search_lr_tfidf.best_params_)
print("Best Accuracy: ", grid_search_lr_tfidf.best_score_)

results.append(
    (
        "lr",
        grid_search_lr_tfidf.best_params_,
        grid_search_lr_tfidf.best_score_
    )
)

Best Hyperparameters:  {'lr__C': 11, 'lr__dual': False, 'lr__penalty': 'l2'}
Best Accuracy:  0.7799395877754087


In [17]:
results

[('mnb', {'mnb__alpha': True}, 0.5873312011371713),
 ('rf', {'rf__criterion': 'gini', 'rf__n_estimators': 26}, 0.7203802416488985),
 ('gb',
  {'gb__criterion': 'friedman_mse', 'gb__n_estimators': 26},
  0.5554371002132197),
 ('knn',
  {'knn__algorithm': 'auto',
   'knn__n_neighbors': 26,
   'knn__weights': 'distance'},
  0.523454157782516),
 ('dt',
  {'dt__criterion': 'gini', 'dt__splitter': 'random'},
  0.6937277896233119),
 ('lr',
  {'lr__C': 11, 'lr__dual': False, 'lr__penalty': 'l2'},
  0.7799395877754087)]

In [19]:
results_without_params = [(i[0], i[2]) for i in results]

In [20]:
df_results = pd.DataFrame(results_without_params, columns=['Model', 'Score'])

In [21]:
df_results

Unnamed: 0,Model,Score
0,mnb,0.587331
1,rf,0.72038
2,gb,0.555437
3,knn,0.523454
4,dt,0.693728
5,lr,0.77994
