In [None]:
import pandas as pd
# Let DataFrame output show up to 1000 characters per cell, so tweet text is
# not cut short when frames are displayed.
pd.set_option("display.max_colwidth", 1000)

# Load the tweets dataset; the path is relative to the working directory.
all_tweets = pd.read_csv("tweets.csv")

In [None]:
# Strip surrounding whitespace from the column headers so attribute-style
# access (all_tweets.political_inclination, all_tweets.text) works even if the
# CSV header carries stray spaces.
all_tweets.columns = all_tweets.columns.str.strip()

# Distinct class labels, sorted in place; reused later as the row/column order
# of the confusion matrices.
all_categories = all_tweets.political_inclination.unique()
all_categories.sort()
print(list(all_categories))

# Only the last expression of a cell is rendered automatically, so the
# intermediate summaries are printed explicitly instead of being discarded.
print(all_tweets.head())
print(all_tweets.describe())

# Row counts per class (rendered as the cell's output).
all_tweets.groupby('political_inclination').count()

In [None]:
## remove retweet prefixes ("RT @user:"), @mentions and links
import re

# One alternation covering the three kinds of noise; compiled once and reused
# for every tweet.
_noise_pattern = re.compile(r'RT @[a-zA-Z0-9_]+:|@[a-zA-Z0-9_]+|http\S+')

all_tweets.text = all_tweets.text.apply(
    lambda tweet: _noise_pattern.sub('', tweet)
)
print(all_tweets.text.head())


In [None]:
from sklearn.model_selection import train_test_split
# Target: the political inclination label. Features: every remaining column
# (the models below only read the `text` column).
y = all_tweets.political_inclination
x = all_tweets.drop(columns="political_inclination")

# Hold out 20% of the rows for testing; stratify on the label and fix the seed
# so the split is identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=23,
)

print(*(part.shape for part in (x_train, y_train, x_test, y_test)))

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import EnglishStemmer

# Building blocks shared by every pipeline in this notebook: an English
# Snowball stemmer and the analyzer of a default-configured CountVectorizer.
stemmer = EnglishStemmer()
analyzer = CountVectorizer().build_analyzer()


def stemmed_words(doc):
    """Run ``doc`` through the default analyzer and stem every token.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    iterator of str
        Lazily produced stemmed tokens, in the order the analyzer emitted them.
    """
    return map(stemmer.stem, analyzer(doc))


# Vectorizer that counts stemmed tokens. All tokenisation is delegated to
# `stemmed_words`; this single instance is placed in every pipeline below.
stem_vectorizer = CountVectorizer(lowercase=True, analyzer=stemmed_words)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


## Naive Bayes

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# Naive Bayes pipeline: stemmed counts -> term-frequency scaling -> MultinomialNB.
# The settings match this recorded parameter set (presumably the best_params_
# of an earlier search):
# {'clf__alpha': 0.01, 'tfidf__use_idf': False, 'vect__analyzer': < function stemmed_words at 0x12fbda550 >, 'vect__stop_words': None}
nb_pipeline = Pipeline(steps=[
    ('vect', stem_vectorizer),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB(alpha=0.01)),
])

In [None]:
# Train on the tweet text only. Pipeline.fit returns the pipeline itself, so
# text_clf_nb and nb_pipeline end up naming the same fitted object.
nb_pipeline.fit(x_train.text, y_train)
text_clf_nb = nb_pipeline

In [None]:
# Score the Naive Bayes pipeline on the held-out tweets.
predicted = text_clf_nb.predict(x_test.text)

# Per-class precision/recall/F1 first, then overall accuracy on its own line.
print(
    classification_report(y_test, predicted),
    accuracy_score(y_test, predicted),
    sep="\n",
)

# Confusion matrix with rows/columns in the sorted category order.
matrix = confusion_matrix(y_test, predicted, labels=list(all_categories))
print(matrix)

## SGDClassifier

In [None]:
# Linear classifier trained with SGD on TF-IDF weighted stemmed counts.
# The settings match this recorded parameter set (presumably the best_params_
# of an earlier search):
# {'clf-svm__alpha': 0.001, 'clf-svm__loss': 'modified_huber', 'clf-svm__penalty': 'l2', 'tfidf__use_idf': True}

from sklearn.linear_model import SGDClassifier

text_clf_svm = Pipeline(
    [
        ('vect', stem_vectorizer),
        ('tfidf', TfidfTransformer(use_idf=True)),
        # random_state pins the data shuffling SGD performs while training, so
        # the metrics reported below are identical on every Restart & Run All
        # (same seed as the train/test split).
        ('clf-svm', SGDClassifier(
            alpha=0.001,
            loss='modified_huber',
            penalty='l2',
            random_state=23,
        )),
    ]
)

# Assign to `_` only to keep the fitted pipeline's repr out of the cell output.
_ = text_clf_svm.fit(x_train.text, y_train)


In [None]:
# Score the SGD pipeline on the held-out tweets.
predicted_svm = text_clf_svm.predict(x_test.text)

svm_report = classification_report(y_test, predicted_svm)
print(svm_report)
svm_accuracy = accuracy_score(y_test, predicted_svm)
print(svm_accuracy)

# Confusion matrix with rows/columns in the sorted category order.
matrix = confusion_matrix(
    y_test,
    predicted_svm,
    labels=list(all_categories),
)
print(matrix)

## Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on TF-IDF weighted stemmed counts.
text_clf_rf = Pipeline(
    [
        ('vect', stem_vectorizer),
        ('tfidf', TfidfTransformer(use_idf=True)),
        # random_state fixes the forest's bootstrap samples and feature
        # sub-sampling, so the reported metrics are reproducible across runs
        # (same seed as the train/test split).
        ('clf-rf', RandomForestClassifier(n_estimators=100, random_state=23)),
    ]
)

# Assign to `_` only to keep the fitted pipeline's repr out of the cell output.
_ = text_clf_rf.fit(x_train.text, y_train)

In [None]:
# Score the random-forest pipeline on the held-out tweets.
predicted_rf = text_clf_rf.predict(x_test.text)

# Report and accuracy are computed and printed in that order.
for metric in (classification_report, accuracy_score):
    print(metric(y_test, predicted_rf))

# Confusion matrix with rows/columns in the sorted category order.
matrix = confusion_matrix(y_test, predicted_rf, labels=list(all_categories))
print(matrix)


## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours (k=7) on TF-IDF weighted stemmed counts.
text_clf_knn = Pipeline(steps=[
    ('vect', stem_vectorizer),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf-knn', KNeighborsClassifier(n_neighbors=7)),
])

# Assign to `_` only to keep the fitted pipeline's repr out of the cell output.
_ = text_clf_knn.fit(x_train.text, y_train)


In [None]:
# Score the KNN pipeline on the held-out tweets.
predicted_knn = text_clf_knn.predict(x_test.text)

print(classification_report(y_test, predicted_knn))
print(accuracy_score(y_test, predicted_knn))

# Confusion matrix with rows/columns in the sorted category order. It is
# printed once, with explicit labels, like the other evaluation cells; the
# extra unlabelled copy that used to precede the report was redundant.
matrix = confusion_matrix(y_test, predicted_knn, labels=list(all_categories))
print(matrix)

## MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron on TF-IDF weighted stemmed counts.
text_clf_mlp = Pipeline(steps=[
    ('vect', stem_vectorizer),
    ('tfidf', TfidfTransformer(use_idf=True)),
    (
        'clf-mlp',
        MLPClassifier(
            hidden_layer_sizes=(25, 11, 7, 5, 3),  # five hidden layers, narrowing
            solver='lbfgs',
            alpha=1e-5,
            max_iter=30000,   # generous cap on optimisation iterations
            random_state=1,   # fixed seed for reproducible results
        ),
    ),
])

# Assign to `_` only to keep the fitted pipeline's repr out of the cell output.
_ = text_clf_mlp.fit(x_train.text, y_train)


In [None]:
# Score the MLP pipeline on the held-out tweets.
predicted_mlp = text_clf_mlp.predict(x_test.text)

print(classification_report(y_test, predicted_mlp))
print(accuracy_score(y_test, predicted_mlp))

# Confusion matrix with rows/columns in the sorted category order. It is
# printed once, with explicit labels, like the other evaluation cells; the
# extra unlabelled copy that used to precede the report was redundant.
matrix = confusion_matrix(y_test, predicted_mlp, labels=list(all_categories))
print(matrix)


## GRID Search (Very time consuming)

In [None]:
# NOTE: the name `GridSearchCV` used by the search cells below is bound to
# RandomizedSearchCV, so each "grid search" evaluates a random sample of the
# candidate combinations rather than every one of them (none of the calls in
# this notebook pass `n_iter`, so the library default applies).
# For an exhaustive search, swap in the real class instead:
# from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV as GridSearchCV


In [19]:
# Hyper-parameter search for the MLP pipeline (`text_clf_mlp`).
# Keys use the "<pipeline step>__<parameter>" convention, so every suffix must
# be a real constructor argument of the step it addresses.

# Candidate architectures: one 15-unit layer, every two-layer (5, k) net for
# k = 2..101, and the deep net used by `text_clf_mlp` itself.
hidden_layer_candidates = tuple(
    [(15,)]
    + [(5, width) for width in range(2, 102)]
    + [(25, 11, 7, 5, 3)]
)

parameters_mlp = {
    'tfidf__use_idf': (True, False),
    # The MLPClassifier argument is called `solver`; the previous key
    # 'clf-mlp__lbfgs' named one of its *values* and made the search fail with
    # an invalid-parameter error.
    'clf-mlp__solver': ('lbfgs', 'sgd', 'adam'),
    'clf-mlp__alpha': (1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001, 0.00000001),
    'clf-mlp__hidden_layer_sizes': hidden_layer_candidates,
}

# Stored under its own name so the result is not confused with (or overwritten
# by) the KNN search, which uses `gs_clf_knn`.
gs_clf_mlp = GridSearchCV(text_clf_mlp, parameters_mlp, n_jobs=-1, verbose=1)
gs_clf_mlp = gs_clf_mlp.fit(x_train.text, y_train)
print(gs_clf_mlp.best_score_)
print(gs_clf_mlp.best_params_)


: 

In [None]:
# Hyper-parameter search for the Naive Bayes pipeline.
# Keys use the "<pipeline step>__<parameter>" convention.
parameters = {
    # stemmed tokens vs. scikit-learn's plain word analyzer
    'vect__analyzer': [stemmed_words, "word"],
    'vect__stop_words': [None, "english"],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

# fit() returns the search object itself, so construction and fitting chain.
gs_clf = GridSearchCV(
    text_clf_nb, parameters, n_jobs=-1, verbose=1
).fit(x_train.text, y_train)

print(gs_clf.best_score_, gs_clf.best_params_, sep="\n")


In [None]:
# Hyper-parameter search for the SGD pipeline.
# Keys use the "<pipeline step>__<parameter>" convention.
sgd_losses = (
    "hinge", "log_loss", "modified_huber", "squared_hinge",
    "squared_error", "huber", "epsilon_insensitive",
    "squared_epsilon_insensitive",
)
parameters_svm = {
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-2, 1e-3),
    'clf-svm__loss': sgd_losses,
    'clf-svm__penalty': ("l2", "l1", "elasticnet"),
}

# fit() returns the search object itself, so construction and fitting chain.
gs_clf_svm = GridSearchCV(
    text_clf_svm, parameters_svm, n_jobs=-1, verbose=1
).fit(x_train.text, y_train)

print(gs_clf_svm.best_score_, gs_clf_svm.best_params_, sep="\n")


In [None]:
# Hyper-parameter search for the KNN pipeline.
# Keys use the "<pipeline step>__<parameter>" convention.
parameters_knn = {
    'tfidf__use_idf': (True, False),
    # odd neighbourhood sizes 3, 5, ..., 15
    'clf-knn__n_neighbors': tuple(range(3, 16, 2)),
}

# fit() returns the search object itself, so construction and fitting chain.
gs_clf_knn = GridSearchCV(
    text_clf_knn, parameters_knn, n_jobs=-1, verbose=1
).fit(x_train.text, y_train)

print(gs_clf_knn.best_score_, gs_clf_knn.best_params_, sep="\n")


In [None]:
# Hyper-parameter search for the random-forest pipeline.
# Keys use the "<pipeline step>__<parameter>" convention, so every suffix must
# be a real RandomForestClassifier argument. The former 'clf-rf__alpha' and
# 'clf-rf__penalty' entries were SGD settings that a forest does not have and
# made the search fail with an invalid-parameter error, so they are gone.
parameters_rf = {
    'tfidf__use_idf': (True, False),
    'clf-rf__bootstrap': [True, False],
    'clf-rf__max_depth': [10, 20, 30, None],   # None = grow trees fully
}
gs_clf_rf = GridSearchCV(text_clf_rf, parameters_rf, n_jobs=-1, verbose=1)
gs_clf_rf = gs_clf_rf.fit(x_train.text, y_train)
print(gs_clf_rf.best_score_)
print(gs_clf_rf.best_params_)
