In [1]:
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from tabulate import tabulate
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import seaborn as sns
import csv
from datetime import datetime
import openpyxl as op

# Data import (labeled and unlabeled)

### Import data and add labels

In [2]:
corona_5g = pd.read_csv("~/Desktop/twitterAnalysis/FakeNews/dataset/graphs/5g_corona_conspiracy/feature_df_5g_corona_conspiracy.csv")
corona_5g['label'] = 'corona_5g'
corona_5g['conspiracy'] = '1'

non_conspiracy = pd.read_csv("~/Desktop/twitterAnalysis/FakeNews/dataset/graphs/non_conspiracy/feature_df_non_conspiracy.csv")
non_conspiracy['label'] = 'non_conspiracy'
non_conspiracy['conspiracy'] = '0'

other_conspiracy = pd.read_csv("~/Desktop/twitterAnalysis/FakeNews/dataset/graphs/other_conspiracy/feature_df_other_conspiracy.csv")
other_conspiracy['label'] = 'other_conspiracy'
other_conspiracy['conspiracy'] = '0'

all_data = corona_5g.append(non_conspiracy)
all_data = all_data.append(other_conspiracy)

all_data = all_data.dropna()

### Split into train and test sets

In [3]:
x_unprocessed = all_data[all_data.columns[0:18]]
x = StandardScaler().fit_transform(x_unprocessed)
y = all_data[all_data.columns[19]]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


### Import unlabeled test data

In [4]:
official_test = pd.read_csv("/Users/maria/Desktop/twitterAnalysis/FakeNews/dataset/graphs/test_graphs/feature_df_test.csv")
official_test_complete = official_test.dropna()
official_test_complete_std = StandardScaler().fit_transform(official_test_complete)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


# Model training

### Run LDA with cross-fold validation using SVD

In [5]:
clf = LinearDiscriminantAnalysis()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, y_pred)
labels = ['non-conspiracy', 'conspiracy']
acc = metrics.accuracy_score(y_test, y_pred)
mcc = metrics.matthews_corrcoef(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred, average='macro')
rec = metrics.recall_score(y_test, y_pred, average='macro')
print("Overall accuracy:",metrics.accuracy_score(y_test, y_pred), '\n')
print("MCC: " + str(mcc), '\n')
print(tabulate({"Label": labels,
               "Precision": precision,
               "Recall": recall,
               "F-score": fscore,
               "Support": support}, headers="keys"))
workbook = op.load_workbook("/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx")
worksheet = workbook.worksheets[0]
row_n = worksheet.max_row+1
worksheet.cell(row=row_n, column=1).value = 'LDA_SVD_coarse'
worksheet.cell(row=row_n, column=2).value = 'LDA'
worksheet.cell(row=row_n, column=3).value = 'Structural Data'
worksheet.cell(row=row_n, column=4).value = 'Coarse'
worksheet.cell(row=row_n, column=5).value = mcc
worksheet.cell(row=row_n, column=6).value = acc
worksheet.cell(row=row_n, column=7).value = prec
worksheet.cell(row=row_n, column=8).value = rec
worksheet.cell(row=row_n, column=9).value = datetime.now()
workbook.save('/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx')

Overall accuracy: 0.8956521739130435 

MCC: 0.06981698999765348 

Label             Precision     Recall    F-score    Support
--------------  -----------  ---------  ---------  ---------
non-conspiracy      0.90708  0.985577   0.9447           416
conspiracy          0.25     0.0454545  0.0769231         44




In [6]:
official_test_complete['label'] = clf.predict(official_test_complete_std)
official_test_all = pd.concat([official_test, official_test_complete], axis=1)
labels = official_test_all['label']
labels.to_csv('LDA_SVD_coarse.csv', header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Run LDA with cross-fold validation using lsqr

In [7]:
clf = LinearDiscriminantAnalysis(solver='lsqr')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, y_pred)
labels = ['non-conspiracy', 'conspiracy']
acc = metrics.accuracy_score(y_test, y_pred)
mcc = metrics.matthews_corrcoef(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred, average='macro')
rec = metrics.recall_score(y_test, y_pred, average='macro')
print("Overall accuracy:",metrics.accuracy_score(y_test, y_pred), '\n')
print("MCC: " + str(mcc), '\n')
print(tabulate({"Label": labels,
               "Precision": precision,
               "Recall": recall,
               "F-score": fscore,
               "Support": support}, headers="keys"))
workbook = op.load_workbook("/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx")
worksheet = workbook.worksheets[0]
row_n = worksheet.max_row+1
worksheet.cell(row=row_n, column=1).value = 'LDA_LSQR_coarse'
worksheet.cell(row=row_n, column=2).value = 'LDA'
worksheet.cell(row=row_n, column=3).value = 'Structural Data'
worksheet.cell(row=row_n, column=4).value = 'Coarse'
worksheet.cell(row=row_n, column=5).value = mcc
worksheet.cell(row=row_n, column=6).value = acc
worksheet.cell(row=row_n, column=7).value = prec
worksheet.cell(row=row_n, column=8).value = rec
worksheet.cell(row=row_n, column=9).value = datetime.now()
workbook.save('/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx')

Overall accuracy: 0.8956521739130435 

MCC: 0.06981698999765348 

Label             Precision     Recall    F-score    Support
--------------  -----------  ---------  ---------  ---------
non-conspiracy      0.90708  0.985577   0.9447           416
conspiracy          0.25     0.0454545  0.0769231         44


In [8]:
official_test_complete['label'] = clf.predict(official_test_complete_std)
official_test_all = pd.concat([official_test, official_test_complete], axis=1)
labels = official_test_all['label']
labels.to_csv('LDA_LSQR_coarse.csv', header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Run LDA with cross-fold validation using lsqr and auto shrinkage

In [9]:
clf = LinearDiscriminantAnalysis(solver='lsqr', shrinkage='auto')
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

precision, recall, fscore, support = metrics.precision_recall_fscore_support(y_test, y_pred)
labels = ['non-conspiracy', 'conspiracy']
acc = metrics.accuracy_score(y_test, y_pred)
mcc = metrics.matthews_corrcoef(y_test, y_pred)
prec = metrics.precision_score(y_test, y_pred, average='macro')
rec = metrics.recall_score(y_test, y_pred, average='macro')
print("Overall accuracy:",metrics.accuracy_score(y_test, y_pred), '\n')
print("MCC: " + str(mcc), '\n')
print(tabulate({"Label": labels,
               "Precision": precision,
               "Recall": recall,
               "F-score": fscore,
               "Support": support}, headers="keys"))
workbook = op.load_workbook("/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx")
worksheet = workbook.worksheets[0]
row_n = worksheet.max_row+1
worksheet.cell(row=row_n, column=1).value = 'LDA_LSQR_shrinkage_coarse'
worksheet.cell(row=row_n, column=2).value = 'LDA'
worksheet.cell(row=row_n, column=3).value = 'Structural Data'
worksheet.cell(row=row_n, column=4).value = 'Coarse'
worksheet.cell(row=row_n, column=5).value = mcc
worksheet.cell(row=row_n, column=6).value = acc
worksheet.cell(row=row_n, column=7).value = prec
worksheet.cell(row=row_n, column=8).value = rec
worksheet.cell(row=row_n, column=9).value = datetime.now()
workbook.save('/Users/maria/Desktop/twitterAnalysis/FakeNews/scripts/graphs/model_summary.xlsx')

Overall accuracy: 0.8978260869565218 

MCC: 0.037191829108863705 

Label             Precision     Recall    F-score    Support
--------------  -----------  ---------  ---------  ---------
non-conspiracy     0.905495  0.990385   0.946039         416
conspiracy         0.2       0.0227273  0.0408163         44


In [10]:
official_test_complete['label'] = clf.predict(official_test_complete_std)
official_test_all = pd.concat([official_test, official_test_complete], axis=1)
labels = official_test_all['label']
labels.to_csv('LDA_LSQR_shrinkage_coarse.csv', header=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [11]:
# sources 

# https://scikit-learn.org/stable/modules/generated/sklearn.discriminant_analysis.LinearDiscriminantAnalysis.html