# Project for ML4B - Modelling with scikit-learn

### Tweet Class for easier usage

In [None]:
class Tweet:
    """A single tweet text together with its party label (``partei``)."""

    def __init__(self, text, partei):
        # Both values are kept as plain public attributes.
        self.text, self.partei = text, partei

### Load data 

Load the tweets from `./output/TwitterData.json`; the file groups the tweet texts by party under the `Tweets` key.

In [None]:
import json

# Path to the JSON dump; tweet texts are grouped per party under the 'Tweets' key.
file_name = './output/TwitterData.json'

with open(file_name, encoding="utf8") as fh:
    data = json.load(fh)

# Flatten the {party: [text, ...]} mapping into one list of labelled Tweet objects.
tweets = []
for partei, texts in data['Tweets'].items():
    tweets.extend(Tweet(text, partei) for text in texts)

len(tweets)

# Prep Data

### Splitting into training and test data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
training, test = train_test_split(
    tweets,
    test_size=0.2,
    random_state=40,
)

In [None]:
# Separate the inputs (tweet text) from the targets (party label) per split.
train_text = [tweet.text for tweet in training]
train_partei = [tweet.partei for tweet in training]

test_text = [tweet.text for tweet in test]
test_partei = [tweet.partei for tweet in test]


### Bag-of-Words Model (TF-IDF)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Turn the raw tweet texts into TF-IDF feature vectors.
vectorizer = TfidfVectorizer()
# Fit on the training texts only, so nothing from the test set leaks into
# the fitted vectorizer ...
train_text_vectors = vectorizer.fit_transform(train_text)
# ... and reuse that fitted vectorizer to transform the test texts.
test_text_vectors = vectorizer.transform(test_text)


# Classification

### Decision Tree

In [None]:
import pickle

from sklearn.tree import DecisionTreeClassifier

# Train a decision tree with default hyper-parameters on the training vectors.
clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_text_vectors, train_partei)

# Smoke test: predict the party for the first test tweet.
clf_dec.predict(test_text_vectors[0])

# Persist the fitted model. The context manager ensures the file handle is
# flushed and closed deterministically.
filename = 'decisionTree.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf_dec, model_file)

### Logistic Regression

In [None]:
import pickle
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier; max_iter is raised to 100000 to
# give the solver a generous iteration budget.
clf_log = LogisticRegression(max_iter=100000)
clf_log.fit(train_text_vectors, train_partei)

# Smoke test: predict the party for the first test tweet.
clf_log.predict(test_text_vectors[0])

# Persist the fitted model. The context manager ensures the file handle is
# flushed and closed deterministically.
filename = 'logisticReg.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clf_log, model_file)

# Evaluation

### Mean Accuracy

In [None]:
# Mean accuracy on the held-out test set: decision tree first, then
# logistic regression.
for model in (clf_dec, clf_log):
    print(model.score(test_text_vectors, test_partei))

### F1 Score

In [None]:
from sklearn.metrics import f1_score

# Parties to report, in output order: with average=None, f1_score returns one
# score per entry of `labels`, in that same order.
party_labels = ['AfD', 'CSU', 'Die Linke', 'CDU', 'SPD', 'FDP', 'Bündnis 90/Die Grünen']

# Print explicitly: a notebook cell only displays its last bare expression,
# so without print() the decision-tree scores would never be shown.
for model_name, model in (('Decision Tree', clf_dec), ('Logistic Regression', clf_log)):
    scores = f1_score(test_partei, model.predict(test_text_vectors),
                      average=None, labels=party_labels)
    print(model_name, scores)

### Confusion Matrix

In [None]:
from matplotlib import pyplot as plt

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on releases that predate it.
try:
    from sklearn.metrics import ConfusionMatrixDisplay
    draw_confusion_matrix = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix as draw_confusion_matrix

# Confusion matrix of the logistic regression model on the test set;
# normalize='true' scales each true-label row to sum to 1.
fig, ax = plt.subplots(figsize=(12, 12))
draw_confusion_matrix(clf_log, test_text_vectors, test_partei, normalize='true', ax=ax)
ax.set_title('Logistic Regression')


In [None]:
from matplotlib import pyplot as plt

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on releases that predate it.
try:
    from sklearn.metrics import ConfusionMatrixDisplay
    draw_confusion_matrix = ConfusionMatrixDisplay.from_estimator
except (ImportError, AttributeError):
    from sklearn.metrics import plot_confusion_matrix as draw_confusion_matrix

# Confusion matrix of the decision tree model on the test set;
# normalize='true' scales each true-label row to sum to 1.
fig, ax = plt.subplots(figsize=(12, 12))
draw_confusion_matrix(clf_dec, test_text_vectors, test_partei, normalize='true', ax=ax)
ax.set_title('Decision Tree')


### GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV
import pickle
# Hyper-parameter grid for the logistic regression: every combination of the
# values below is scored with 5-fold cross-validation on the training data.
parameters_log = {'C': (1, 50), 'solver': ('sag', 'saga'), 'max_iter': (10000, 100000)}
clfTuned = GridSearchCV(clf_log, parameters_log, cv=5)
clfTuned.fit(train_text_vectors, train_partei)

# Persist the fitted grid search itself (clfTuned), and close the file
# deterministically via the context manager.
filename = 'clftuned.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(clfTuned, model_file)

In [None]:
# Score the grid-searched model on the held-out test set.
tuned_accuracy = clfTuned.score(test_text_vectors, test_partei)
print(tuned_accuracy)