In [None]:
import pandas as pd
from datetime import datetime
import glob
import os
from matplotlib import pyplot as plt
from scipy import stats

In [None]:
# Load every cleaned-tweet CSV in the data directory into a single DataFrame.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the clean_tweets folder before running.
path = r'/Users/francoismizrahi/Documents/LBS/Courses/London Lab/data/clean_tweets/'

# glob returns files in arbitrary filesystem order; sort so the concatenation
# order (and therefore which duplicate row is kept later) is reproducible.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    # pd.concat on an empty iterable fails with an unhelpful
    # "No objects to concatenate"; fail early with the actual cause.
    raise FileNotFoundError(f"No CSV files found in {path!r}")

df = pd.concat(pd.read_csv(f) for f in all_files)

In [None]:
# De-duplicate tweets, order them chronologically and derive calendar columns.
# Written without in-place mutation so the cell can be re-run safely: the
# leftover CSV index columns are dropped only if they are still present.
df = (
    df.drop_duplicates(subset=['Tweet Id'])
      .sort_values(by=['Datetime'])
      .reset_index(drop=True)
      .drop(columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")
)

# Parse the timestamp column once and reuse it for every derived field.
parsed_datetime = pd.to_datetime(df['Datetime'])
df['date'] = parsed_datetime.dt.date
df['week'] = parsed_datetime.dt.isocalendar().week  # ISO week number
df['month'] = parsed_datetime.dt.month
df['Text'] = df['Text'].astype('str')

In [None]:
# Number of tweets left in the frame.
df.shape[0]

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Daily mean of every numeric column, indexed by calendar date.
# numeric_only=True is needed because the frame also holds text columns;
# recent pandas versions raise a TypeError instead of silently skipping them.
df_group = df.groupby(['date']).mean(numeric_only=True)

In [None]:
# Daily mean score over time.
fig, ax = plt.subplots(figsize=(15, 10))
ax.plot(df_group["score"], marker='o', alpha=0.4)

In [None]:
# 30-row rolling mean of the daily score.
# A window of N rows yields its first valid value at position N - 1, so slice
# from there; slicing from N would discard the first valid point.
ROLLING_WINDOW = 30
rolling = df_group["score"].rolling(ROLLING_WINDOW).mean().iloc[ROLLING_WINDOW - 1:]

In [None]:
# Rolling mean of the daily score, with March 2018 shaded in red.
span_start = datetime.strptime("2018-03-01", '%Y-%m-%d').date()
span_end = datetime.strptime("2018-04-01", '%Y-%m-%d').date()

plt.figure(figsize=(15, 10))
plt.plot(rolling, marker='o', alpha=0.4)
plt.axvspan(span_start, span_end, color="red", alpha=0.2)


In [None]:
# Split tweets around 1 March 2018: everything earlier vs. March 2018 itself
# (1 March inclusive, 1 April exclusive).
march_start = datetime.strptime("2018-03-01", '%Y-%m-%d').date()
april_start = datetime.strptime("2018-04-01", '%Y-%m-%d').date()

tweet_date = df["date"]
df_months_before = df[tweet_date < march_start]
df_months_after = df[(tweet_date >= march_start) & (tweet_date < april_start)]


In [None]:
# Mean score of the "before" subset.
df_months_before["score"].mean()

In [None]:
# Mean score of the "after" subset.
df_months_after["score"].mean()

In [None]:
# Relative change of the mean score from the "before" to the "after" subset, in percent.
mean_before = df_months_before["score"].mean()
mean_after = df_months_after["score"].mean()
(mean_after - mean_before) / mean_before * 100

In [None]:
# Independent two-sample t-test on scores: "after" subset vs. "before" subset.
ttest_result = stats.ttest_ind(df_months_after["score"], df_months_before["score"])
stat, p = ttest_result

In [None]:
# Report the t-test outcome, one line per value.
for report_line in (f"p-value of: {p}", f"t-stat of: {stat}"):
    print(report_line)

## Cambridge Analytica

In [None]:
# Number of tweets per week, as a bar chart.
plt.figure(figsize=(15, 10))
tweets_per_week = df.groupby("week")["month"].count()
tweets_per_week.plot(kind="bar", alpha=0.4)

In [None]:
# Number of tweets per week whose text mentions "Cambridge Analytica"
# (case-insensitive), as a bar chart.
plt.figure(figsize=(15, 10))
mentions_ca = df['Text'].str.contains('Cambridge Analytica', case=False)
ca_per_week = df[mentions_ca].groupby("week")["month"].count()
ca_per_week.plot(kind="bar", alpha=0.4)


In [None]:
# Share of all tweets whose text mentions "Cambridge Analytica"
# (case-insensitive), in percent.
ca_mask = df['Text'].str.contains('Cambridge Analytica', case=False)
int(ca_mask.sum()) / len(df) * 100

In [None]:
# Weekly tweet volume (blue) with the "Cambridge Analytica" subset drawn over it (red).
barWidth = 1
plt.figure(figsize=(15, 10))

# Total tweets per week, and the subset whose text mentions
# "Cambridge Analytica" (case-insensitive).
mentions_ca = df['Text'].str.contains('Cambridge Analytica', case=False)
bar1 = df.groupby("week")["month"].count()
# Weeks without any matching tweet are missing from the filtered count;
# reindex onto bar1's weeks (filling 0) so both series share length and order.
# Without this, plt.bar raises a shape-mismatch error.
bar2 = (
    df[mentions_ca]
    .groupby("week")["month"]
    .count()
    .reindex(bar1.index, fill_value=0)
)

# Use the week numbers actually present in the data as x positions instead of
# assuming exactly weeks 1..31.
r = [int(week) for week in bar1.index]

# Blue bars: all tweets
plt.bar(r, bar1, color='blue', edgecolor='white', width=barWidth, alpha=0.4)
# Red bars: "Cambridge Analytica" tweets, drawn over the blue ones
plt.bar(r, bar2, color='red', edgecolor='white', width=barWidth)

# The x axis shows the week number
plt.xlabel("week")

# Show graphic
plt.show()