In [31]:
import numpy as np # to use numpy arrays instead of lists
import pandas as pd # DataFrame (table)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

ModuleNotFoundError: No module named 'matplotlib'

In [26]:
def readData():

    dataSet = pd.read_csv('archives/archive1.zip', header = 0, compression = 'zip')
    return dataSet

def encodeData(dataSet):

    dataSet.dropna(inplace=True)

    # Dropping irrelevant columns 'Year', 'Month', 'Day', 'Time of Tweet', 'Platform' 
    dataSet.drop(['Year', 'Month', 'Day', 'Time of Tweet', 'Platform'], axis=1, inplace=True)

    # Add 'neutral' sentiment to 'negative' sentiment
    dataSet['sentiment'] = dataSet['sentiment'].replace('neutral', 'negative')

    # mapping the sentiment to 0 and 1
    dataSet['sentiment'] = dataSet['sentiment'].map({'positive': 1, 'negative': 0})
    return dataSet

# Split the data into training and testing sets
def splitData(dataSet):

    X = dataSet['text']
    y = dataSet['sentiment']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

# Vectorize the data | Proccess the data
def vectorizeData(X_train, X_test):
    
    vectorizer = TfidfVectorizer(max_features=1000)
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)
    return [X_train, X_test]


In [29]:
def metodosML(train_vectors, y_train, test_vectors, y_test):
    methodsUsed = ['SVM', 'DT', 'LR', 'RF', 'NN']
    performanceHeaders = ['precision', 'recall', 'f1-score']
    modPerformancePos = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)
    modPerformanceNeg = pd.DataFrame(index=methodsUsed, columns=performanceHeaders)

    # SVM
    print('Classification with SVM')
    svm_clf = svm.SVC()
    param_search_svm = {'kernel': ["poly"], 'degree': [1, 2, 3, 4], 'coef0': [1, 2]}
    grid_search_svm = GridSearchCV(estimator=svm_clf, param_grid=param_search_svm, cv=5, verbose=1)
    grid_search_svm.fit(train_vectors, y_train)
    best_clf_svm = grid_search_svm.best_estimator_
    svm_prediction = best_clf_svm.predict(test_vectors)
    report = classification_report(y_test, svm_prediction, output_dict=True)
    modPerformancePos.loc['SVM'] = [report['1']['precision'], report['1']['recall'], report['1']['f1-score']]
    modPerformanceNeg.loc['SVM'] = [report['0']['precision'], report['0']['recall'], report['0']['f1-score']]
    print("SVM accuracy: ", round(report['accuracy'], 2))

    # Decision Tree
    print('Classification with DT')
    dt_clf = DecisionTreeClassifier()
    param_search_clf = {'criterion': ["gini", 'entropy'], 'max_depth': [5, 10, 20, 30, None]}
    grid_search_dt = GridSearchCV(estimator=dt_clf, param_grid=param_search_clf, cv=5, verbose=1)
    grid_search_dt.fit(train_vectors, y_train)
    best_clf_dt = grid_search_dt.best_estimator_
    dt_prediction = best_clf_dt.predict(test_vectors)
    report = classification_report(y_test, dt_prediction, output_dict=True)
    modPerformancePos.loc['DT'] = [report['1']['precision'], report['1']['recall'], report['1']['f1-score']]
    modPerformanceNeg.loc['DT'] = [report['0']['precision'], report['0']['recall'], report['0']['f1-score']]
    print("DT accuracy: ", round(report['accuracy'], 2))

    # Logistic Regression
    print('Classification with Logistic Regression')
    lr_clf = LogisticRegression(max_iter=1000)
    lr_clf.fit(train_vectors, y_train)
    lr_prediction = lr_clf.predict(test_vectors)
    report = classification_report(y_test, lr_prediction, output_dict=True)
    modPerformancePos.loc['LR'] = [report['1']['precision'], report['1']['recall'], report['1']['f1-score']]
    modPerformanceNeg.loc['LR'] = [report['0']['precision'], report['0']['recall'], report['0']['f1-score']]
    print("LR accuracy: ", round(report['accuracy'], 2))

    # Random Forest
    print('Classification with Random Forest')
    rf_clf = RandomForestClassifier(n_estimators=100)
    rf_clf.fit(train_vectors, y_train)
    rf_prediction = rf_clf.predict(test_vectors)
    report = classification_report(y_test, rf_prediction, output_dict=True)
    modPerformancePos.loc['RF'] = [report['1']['precision'], report['1']['recall'], report['1']['f1-score']]
    modPerformanceNeg.loc['RF'] = [report['0']['precision'], report['0']['recall'], report['0']['f1-score']]
    print("RF accuracy: ", round(report['accuracy'], 2))

    # Neural Network
    print('Classification with Neural Network')
    model = Sequential()
    model.add(Dense(512, activation='relu', input_shape=(train_vectors.shape[1],)))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Display model summary
    model.summary()

    # Train the model and plot training history
    history = model.fit(train_vectors.toarray(), y_train, epochs=5, batch_size=512, validation_split=0.1, verbose=1)

    # Plot training & validation accuracy values
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'])
    plt.plot(history.history['val_accuracy'])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

    # Plot training & validation loss values
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

    plt.show()

    nn_prediction = (model.predict(test_vectors.toarray()) > 0.5).astype("int32")
    report = classification_report(y_test, nn_prediction, output_dict=True)
    modPerformancePos.loc['NN'] = [report['1']['precision'], report['1']['recall'], report['1']['f1-score']]
    modPerformanceNeg.loc['NN'] = [report['0']['precision'], report['0']['recall'], report['0']['f1-score']]
    print("NN accuracy: ", round(report['accuracy'], 2))

    print(modPerformancePos)
    print(modPerformanceNeg)


In [30]:
def main():
    datos = readData()
    encodeDatos = encodeData(datos)
    X_train, X_test, y_train, y_test = splitData(encodeDatos)
    train_vectors, test_vectors = vectorizeData(X_train, X_test)
    metodosML(train_vectors, y_train, test_vectors, y_test)

main()

Classification with SVM
Fitting 5 folds for each of 8 candidates, totalling 40 fits
SVM accuracy:  0.82
Classification with DT
Fitting 5 folds for each of 10 candidates, totalling 50 fits
DT accuracy:  0.79
Classification with Logistic Regression
LR accuracy:  0.73
Classification with Random Forest
RF accuracy:  0.86
Classification with Neural Network
Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.5432 - loss: 0.6923 - val_accuracy: 0.6250 - val_loss: 0.6817
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step - accuracy: 0.6630 - loss: 0.6734 - val_accuracy: 0.6250 - val_loss: 0.6727
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.6741 - loss: 0.6604 - val_accuracy: 0.6250 - val_loss: 0.6647
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.6741 - loss: 0.6465 - val_accuracy: 0.6250 - val_loss: 0.6581
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.6741 - loss: 0.6307 - val_accuracy: 0.6250 - val_loss: 0.6526
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step 
NN accuracy:  0.66

Positive Sentiment
    precision    recall  f1-score
SVM  0.863636  0.558824  0.678571
DT    0.69697  0.676471  0.686567
LR       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Tables and Graphs