In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
import csv
from datetime import datetime
import openpyxl as op

# Data import (labeled and unlabeled)

### Import labeled training and test data

In [2]:
corona_5g = pd.read_csv("~/Desktop/twitterAnalysis/FakeNews/dataset/graphs/5g_corona_conspiracy/feature_df_5g_corona_conspiracy.csv")
corona_5g['label'] = '1'
corona_5g['conspiracy'] = '1'

non_conspiracy = pd.read_csv("~/Desktop/twitterAnalysis/FakeNews/dataset/graphs/non_conspiracy/feature_df_non_conspiracy.csv")
non_conspiracy['label'] = '2'
non_conspiracy['conspiracy'] = '0'

other_conspiracy = pd.read_csv("~/Desktop/twitterAnalysis/FakeNews/dataset/graphs/other_conspiracy/feature_df_other_conspiracy.csv")
other_conspiracy['label'] = '3'
other_conspiracy['conspiracy'] = '0'

all_data = corona_5g.append(non_conspiracy)
all_data = all_data.append(other_conspiracy)

all_data = all_data.dropna()

### Split into training and test sets

In [3]:
x_unprocessed = all_data[all_data.columns[0:18]]
x = StandardScaler().fit_transform(x_unprocessed)
y = all_data[all_data.columns[18]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Import unlabeled test data

In [4]:
official_test = pd.read_csv("/Users/maria/Desktop/twitterAnalysis/FakeNews/dataset/graphs/test_graphs/feature_df_test.csv")
official_test_complete = official_test.dropna()
official_test_complete_std = StandardScaler().fit_transform(official_test_complete)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


# Model training

### Default decision tree - coarse classifier

In [5]:
clf = DecisionTreeClassifier()
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, y_pred)
labels = ['5G-conspiracy', 'non-conspiracy', 'other-conspiracy']
acc = metrics.accuracy_score(y_test, y_pred)
mcc = metrics.matthews_corrcoef(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred, average='macro')
rec = metrics.recall_score(y_test, y_pred, average='macro')
print("Overall accuracy:" + str(acc), '\n')
print("MCC: " + str(mcc), '\n')
print("Precision: " + str(prec), '\n')
print("Recall: " + str(rec), '\n')
print(tabulate({"Label": labels,
               "Precision": precision,
               "Recall": recall,
               "F-score": fscore,
               "Support": support}, headers="keys"))
workbook = op.load_workbook("/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx")
worksheet = workbook.worksheets[0]
row_n = worksheet.max_row+1
worksheet.cell(row=row_n, column=1).value = 'decision_tree_gini'
worksheet.cell(row=row_n, column=2).value = 'Decision Tree'
worksheet.cell(row=row_n, column=3).value = 'Structural Data'
worksheet.cell(row=row_n, column=4).value = 'Multi-class'
worksheet.cell(row=row_n, column=5).value = mcc
worksheet.cell(row=row_n, column=6).value = acc
worksheet.cell(row=row_n, column=7).value = prec
worksheet.cell(row=row_n, column=8).value = rec
worksheet.cell(row=row_n, column=9).value = datetime.now()
workbook.save('/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx')

Overall accuracy:0.5717391304347826 

MCC: 0.048415622101084924 

Precision: 0.3562547250237256 

Recall: 0.3561996905370399 

Label               Precision    Recall    F-score    Support
----------------  -----------  --------  ---------  ---------
5G-conspiracy       0.0943396  0.113636   0.103093         44
non-conspiracy      0.73913    0.716867   0.727829        332
other-conspiracy    0.235294   0.238095   0.236686         84


In [6]:
official_test_complete['label'] = clf.predict(official_test_complete_std)
official_test_all = pd.concat([official_test, official_test_complete], axis=1)
labels = official_test_all['label']
labels.fillna('0', inplace=True)
labels.to_csv('decision_tree_gini.csv', header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


### Entropy (information gain) criterion  - coarse classifier

In [7]:
clf = DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, y_pred)
labels = ['5G-conspiracy', 'non-conspiracy', 'other-conspiracy']
acc = metrics.accuracy_score(y_test, y_pred)
mcc = metrics.matthews_corrcoef(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred, average='macro')
rec = metrics.recall_score(y_test, y_pred, average='macro')
print("Overall accuracy:" + str(acc), '\n')
print("MCC: " + str(mcc), '\n')
print("Precision: " + str(prec), '\n')
print("Recall: " + str(rec), '\n')
print(tabulate({"Label": labels,
               "Precision": precision,
               "Recall": recall,
               "F-score": fscore,
               "Support": support}, headers="keys"))
workbook = op.load_workbook("/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx")
worksheet = workbook.worksheets[0]
row_n = worksheet.max_row+1
worksheet.cell(row=row_n, column=1).value = 'decision_tree_entropy'
worksheet.cell(row=row_n, column=2).value = 'Decision Tree'
worksheet.cell(row=row_n, column=3).value = 'Structural Data'
worksheet.cell(row=row_n, column=4).value = 'Multi-class'
worksheet.cell(row=row_n, column=5).value = mcc
worksheet.cell(row=row_n, column=6).value = acc
worksheet.cell(row=row_n, column=7).value = prec
worksheet.cell(row=row_n, column=8).value = rec
worksheet.cell(row=row_n, column=9).value = datetime.now()
workbook.save('/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx')

Overall accuracy:0.5673913043478261 

MCC: 0.02900872243215721 

Precision: 0.34694198288267813 

Recall: 0.34954971400754536 

Label               Precision    Recall    F-score    Support
----------------  -----------  --------  ---------  ---------
5G-conspiracy        0.12963   0.159091   0.142857         44
non-conspiracy       0.736196  0.722892   0.729483        332
other-conspiracy     0.175     0.166667   0.170732         84


In [8]:
official_test_complete['label'] = clf.predict(official_test_complete_std)
official_test_all = pd.concat([official_test, official_test_complete], axis=1)
labels = official_test_all['label']
labels.fillna('0', inplace=True)
labels.to_csv('decision_tree_entropy.csv', header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


### Gini criterion with max depth  - coarse classifier

In [9]:
clf = DecisionTreeClassifier(max_depth=3)
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, y_pred)
labels = ['5G-conspiracy', 'non-conspiracy', 'other-conspiracy']
acc = metrics.accuracy_score(y_test, y_pred)
mcc = metrics.matthews_corrcoef(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred, average='macro')
rec = metrics.recall_score(y_test, y_pred, average='macro')
print("Overall accuracy:" + str(acc), '\n')
print("MCC: " + str(mcc), '\n')
print("Precision: " + str(prec), '\n')
print("Recall: " + str(rec), '\n')
print(tabulate({"Label": labels,
               "Precision": precision,
               "Recall": recall,
               "F-score": fscore,
               "Support": support}, headers="keys"))
workbook = op.load_workbook("/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx")
worksheet = workbook.worksheets[0]
row_n = worksheet.max_row+1
worksheet.cell(row=row_n, column=1).value = 'decision_tree_gini_max_depth'
worksheet.cell(row=row_n, column=2).value = 'Decision Tree'
worksheet.cell(row=row_n, column=3).value = 'Structural Data'
worksheet.cell(row=row_n, column=4).value = 'Multi-class'
worksheet.cell(row=row_n, column=5).value = mcc
worksheet.cell(row=row_n, column=6).value = acc
worksheet.cell(row=row_n, column=7).value = prec
worksheet.cell(row=row_n, column=8).value = rec
worksheet.cell(row=row_n, column=9).value = datetime.now()
workbook.save('/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx')

Overall accuracy:0.7108695652173913 

MCC: -0.002483053106230638 

Precision: 0.2779830828611316 

Recall: 0.33127749091604514 

Label               Precision     Recall    F-score    Support
----------------  -----------  ---------  ---------  ---------
5G-conspiracy        0         0          0                 44
non-conspiracy       0.722838  0.981928   0.832695         332
other-conspiracy     0.111111  0.0119048  0.0215054         84


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [10]:
official_test_complete['label'] = clf.predict(official_test_complete_std)
official_test_all = pd.concat([official_test, official_test_complete], axis=1)
labels = official_test_all['label']
labels.fillna('0', inplace=True)
labels.to_csv('decision_tree_gini_max_depth.csv', header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


### Entropy (information gain) criterion with max depth  - coarse classifier

In [11]:
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, y_pred)
labels = ['5G-conspiracy', 'non-conspiracy', 'other-conspiracy']
acc = metrics.accuracy_score(y_test, y_pred)
mcc = metrics.matthews_corrcoef(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred, average='macro')
rec = metrics.recall_score(y_test, y_pred, average='macro')
print("Overall accuracy:" + str(acc), '\n')
print("MCC: " + str(mcc), '\n')
print("Precision: " + str(prec), '\n')
print("Recall: " + str(rec), '\n')
print(tabulate({"Label": labels,
               "Precision": precision,
               "Recall": recall,
               "F-score": fscore,
               "Support": support}, headers="keys"))
workbook = op.load_workbook("/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx")
worksheet = workbook.worksheets[0]
row_n = worksheet.max_row+1
worksheet.cell(row=row_n, column=1).value = 'decision_tree_entropy_max_depth'
worksheet.cell(row=row_n, column=2).value = 'Decision Tree'
worksheet.cell(row=row_n, column=3).value = 'Structural Data'
worksheet.cell(row=row_n, column=4).value = 'Multi-class'
worksheet.cell(row=row_n, column=5).value = mcc
worksheet.cell(row=row_n, column=6).value = acc
worksheet.cell(row=row_n, column=7).value = prec
worksheet.cell(row=row_n, column=8).value = rec
worksheet.cell(row=row_n, column=9).value = datetime.now()
workbook.save('/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx')

Overall accuracy:0.7108695652173913 

MCC: -0.002483053106230638 

Precision: 0.2779830828611316 

Recall: 0.33127749091604514 

Label               Precision     Recall    F-score    Support
----------------  -----------  ---------  ---------  ---------
5G-conspiracy        0         0          0                 44
non-conspiracy       0.722838  0.981928   0.832695         332
other-conspiracy     0.111111  0.0119048  0.0215054         84


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [12]:
official_test_complete['label'] = clf.predict(official_test_complete_std)
official_test_all = pd.concat([official_test, official_test_complete], axis=1)
labels = official_test_all['label']
labels.fillna('0', inplace=True)
labels.to_csv('decision_tree_entropy_max_depth.csv', header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


# Extra Features

### Visualize decision tree

In [13]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
dot_data = StringIO()
feature_cols = list(x_unprocessed.columns.values)
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols,class_names=['no', 'yes'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('decision_tree.png')
Image(graph.create_png())

IndexError: list index out of range

### Feature importance

In [None]:
feat_importance = clf.tree_.compute_feature_importances(normalize=False)
feat_imp_dict = dict(zip(feature_cols, clf.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.rename(columns = {0:'FeatureImportance'}, inplace = True)
feat_imp.sort_values(by=['FeatureImportance'], ascending=False).head()

In [None]:
# Sources

# https://www.datacamp.com/community/tutorials/decision-tree-classification-python
# https://towardsdatascience.com/decision-tree-algorithm-for-multiclass-problems-using-python-6b0ec1183bf5
