In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
import joblib
import os

In [2]:
datapath = '/home/dafne/shared/FilterBubble/topic-modeling/Felicia-Archive/'

In [3]:
df = pd.read_pickle(os.path.join(datapath, 'classifier_data.pkl'))

In [4]:
df = df[df.v9_major_rec != " "]
y = df.v9_major_rec

In [5]:
len(df)

11124

In [6]:
recode = {'Binnenland':['13','14','20', '3', '4', '5', '6'], 'Buitenland':['16', '19', '2'], 'Economie':['1','15'], 'Milieu':['8', '7'],  'Wetenschap':['17'], 'Immigratie':['9'],  'Justitie':['12'], 'Sport':['29'], 'Entertainment':['23'], 'Anders':['10','99']}

In [7]:
def recode_topics(number):
    for key, value in recode.items():
        if number in value:
            result = key
    return result

In [8]:
df['topic'] = df['v9_major_rec'].apply(recode_topics)

In [9]:
df['topic'].value_counts()

Binnenland       2500
Buitenland       1831
Anders           1670
Justitie         1201
Entertainment    1043
Economie         1036
Sport            1029
Wetenschap        427
Milieu            235
Immigratie        152
Name: topic, dtype: int64

In [10]:
df = df[df.topic != " "]
y = df.topic
df = df.drop('topic', axis = 1)

In [11]:
df['Processed_text']=[" ".join(text) for text in df['Processed_text'].values]

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df['Processed_text'], y, test_size=0.2)

In [13]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
with open('vectorizer.pkl', 'wb') as fin:
    joblib.dump(tfidf_vectorizer, fin)


In [14]:
tfidf_test = tfidf_vectorizer.transform(X_test)

In [15]:
linear_clf = PassiveAggressiveClassifier(n_iter=50)

TypeError: __init__() got an unexpected keyword argument 'n_iter'

In [None]:
linear_clf.fit(tfidf_train, y_train)

In [None]:
pred = linear_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

In [None]:
a = classification_report(y_test, pred, target_names = ['Binnenland', 'Buitenland', 'Economie', 'Milieu', 'Wetenschap', 'Justitie', 'Immigratie', 'Sport', 'Entertainment', 'Anders'])

In [None]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.utils.multiclass import unique_labels
def classification_report_pandas(ground_truth,
                                            predictions):
    """
    Saves the classification report to csv using the pandas module.
    :param ground_truth: list: the true labels
    :param predictions: list: the predicted labels
    """
    labels = unique_labels(ground_truth, predictions)

    precision, recall, f_score, support = precision_recall_fscore_support(ground_truth,
                                                                          predictions,
                                                                          labels=labels,
                                                                          average=None)
    results_pd = pd.DataFrame({"topic": labels,
                               "f_score": f_score,
                               'precision': precision,
                               'recall':recall,
                               })
    return results_pd

In [None]:
df2 = classification_report_pandas(y_test, pred)

In [None]:
b = y.value_counts()

In [None]:
b = b.to_frame()

In [None]:
b['index'] = b.index

In [None]:
final = pd.merge(df2, b, left_on='topic', right_on='index')
final = final.drop('index', axis = 1)
final.rename(columns = {'topic_y':'number', 'topic_x':'topic'}, inplace = True)

In [None]:
final

In [85]:
final = final.drop(final[final.topic == 'Milieu'].index)

In [86]:
final['f_score'].mean()

0.6674526881343473

In [87]:
with open('topic_classifier.pkl', 'wb') as fid:
    joblib.dump(linear_clf, fid)  