In [60]:
#let's import here relevant library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import altair as alt
from scipy.spatial.distance import cdist, pdist
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [61]:
# Select seaborn's 'darkgrid' theme for the plotting style
sns.set_style(style='darkgrid')

In [62]:
# Read the tweet sample from the CSV file into a DataFrame,
# then preview the first rows to check the columns
df = pd.read_csv(filepath_or_buffer='sample_tweets.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,author,content,region,language,publish_date,following,followers,post_type,account_type,retweet,account_category,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
0,0,ADNNELSTR,The empty podium says it all. The Marxists are...,Unknown,English,8/13/2017 21:00,946,83,,Right,0,RightTroll,896838937028775937,http://twitter.com/893370639309996032/statuses...,https://twitter.com/i/web/status/8968383433838...,,
1,1,ATLANTA_ONLINE,"Dust in your house may have toxic chemicals, s...",United States,English,9/15/2016 3:34,7825,17383,,local,0,NewsFeed,776262908388839424,http://twitter.com/Atlanta_Online/statuses/776...,http://on-ajc.com/2coKeyO,,
2,2,ANTONHAYHAY,singers Get 1 million soundcloud plays for $45...,United States,English,3/3/2017 20:41,303,711,RETWEET,Left,1,LeftTroll,837764807826763781,http://twitter.com/1652262638/statuses/8377648...,https://twitter.com/jayceodpromoter/status/837...,http://bit.ly/2m4KZPJ,
3,3,AMELIEBALDWIN,.@JohnsonHildy may be on to something about th...,United States,English,2/23/2017 2:05,2322,2743,RETWEET,Right,1,RightTroll,834584872111001601,http://twitter.com/1679279490/statuses/8345848...,http://www.washingtonexaminer.com/james-okeefe...,,
4,4,AMELIEBALDWIN,".@ViolaDavis just made #Oscars history, and it...",United States,English,1/25/2017 9:36,2340,2726,RETWEET,Right,1,RightTroll,824189182897754117,http://twitter.com/1679279490/statuses/8241891...,https://twitter.com/hellogiggles/status/824184...,http://trib.al/adflVAC,


In [63]:
# Remove the 'Unnamed: 0' column, which appears to be a row counter saved with
# the CSV and carries no information for the analysis.
# The frame is rebound (no inplace mutation) and errors='ignore' is passed so the
# cell can be re-run safely: a second run no longer raises KeyError because the
# column has already been dropped.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,author,content,region,language,publish_date,following,followers,post_type,account_type,retweet,account_category,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
0,ADNNELSTR,The empty podium says it all. The Marxists are...,Unknown,English,8/13/2017 21:00,946,83,,Right,0,RightTroll,896838937028775937,http://twitter.com/893370639309996032/statuses...,https://twitter.com/i/web/status/8968383433838...,,
1,ATLANTA_ONLINE,"Dust in your house may have toxic chemicals, s...",United States,English,9/15/2016 3:34,7825,17383,,local,0,NewsFeed,776262908388839424,http://twitter.com/Atlanta_Online/statuses/776...,http://on-ajc.com/2coKeyO,,
2,ANTONHAYHAY,singers Get 1 million soundcloud plays for $45...,United States,English,3/3/2017 20:41,303,711,RETWEET,Left,1,LeftTroll,837764807826763781,http://twitter.com/1652262638/statuses/8377648...,https://twitter.com/jayceodpromoter/status/837...,http://bit.ly/2m4KZPJ,
3,AMELIEBALDWIN,.@JohnsonHildy may be on to something about th...,United States,English,2/23/2017 2:05,2322,2743,RETWEET,Right,1,RightTroll,834584872111001601,http://twitter.com/1679279490/statuses/8345848...,http://www.washingtonexaminer.com/james-okeefe...,,
4,AMELIEBALDWIN,".@ViolaDavis just made #Oscars history, and it...",United States,English,1/25/2017 9:36,2340,2726,RETWEET,Right,1,RightTroll,824189182897754117,http://twitter.com/1679279490/statuses/8241891...,https://twitter.com/hellogiggles/status/824184...,http://trib.al/adflVAC,


In [64]:
# For every author, count the rows whose account_type is not null
# (in practice: how many tweets of that author are in the sample),
# then move 'author' from the index back to a regular column.
df_3 = (
    df
    .pivot_table(index='author', values=['account_type'], aggfunc='count')
    .reset_index()
)

In [65]:
# Display the per-author account_type counts computed in the previous cell
df_3

Unnamed: 0,author,account_type
0,10_GOP,87
1,1D_NICOLE_,9
2,1ERIK_LEE,1
3,2NDHALFONION,1
4,4EVER_SUSAN,21
...,...,...
163,ASWWIMMORRIS,181
164,ATIF_SHAIKH_ME,16
165,ATLANTA_ONLINE,1991
166,AURRLISTR,295


In [66]:
# Sum the `retweet` column per author. The column looks like a 0/1 flag, so the
# result is the number of retweets each author posted (not the total tweet volume).
# The aggregation is given as the string 'sum' rather than np.sum: recent pandas
# versions emit a FutureWarning when a NumPy callable is passed as aggfunc.
# reset_index() is chained instead of using inplace=True so the cell stays idempotent.
df_4 = (
    df
    .pivot_table(values=['retweet'], index='author', aggfunc='sum')
    .reset_index()
)
df_4.head()

Unnamed: 0,author,retweet
0,10_GOP,30
1,1D_NICOLE_,0
2,1ERIK_LEE,1
3,2NDHALFONION,0
4,4EVER_SUSAN,9


In [67]:
# Combine the two per-author summaries (account_type count and retweet sum)
# into one frame, matching rows on the shared 'author' column (inner join).
df_5 = df_3.merge(df_4, on='author', how='inner')
df_5.head()

Unnamed: 0,author,account_type,retweet
0,10_GOP,87,30
1,1D_NICOLE_,9,0
2,1ERIK_LEE,1,1
3,2NDHALFONION,1,0
4,4EVER_SUSAN,21,9


# Answers_Question 1

In [68]:
# The two features of df_5 live on different scales, so they are standardised
# (zero mean, unit variance) before clustering; otherwise the feature with the
# larger variance would dominate the Euclidean distances used by k-means.
stand = StandardScaler()
X = stand.fit_transform(df_5[['retweet', 'account_type']])

In [69]:
# Scan k = 2..14 and print the silhouette score of each k-means solution to get a
# first idea of a sensible number of clusters (higher score = better separated,
# more compact clusters).
# random_state is fixed to the same seed used by the other KMeans cells of this
# notebook: without it the centroid initialisation is random and the printed
# scores change from one run to the next.
for k in range(2, 15):
    k_means = KMeans(n_clusters=k, max_iter=300, random_state=8)
    k_means.fit(X)
    print('for K value',k, ', silhouette-score: %0.3f' % silhouette_score(X, k_means.labels_, metric='euclidean'))

for K value 2 , silhouette-score: 0.954
for K value 3 , silhouette-score: 0.894
for K value 4 , silhouette-score: 0.794
for K value 5 , silhouette-score: 0.790
for K value 6 , silhouette-score: 0.761
for K value 7 , silhouette-score: 0.691
for K value 8 , silhouette-score: 0.686
for K value 9 , silhouette-score: 0.686
for K value 10 , silhouette-score: 0.687
for K value 11 , silhouette-score: 0.699
for K value 12 , silhouette-score: 0.699
for K value 13 , silhouette-score: 0.703
for K value 14 , silhouette-score: 0.700


In [70]:
# Elbow method: fit k-means for k = 1..14 and record the inertia
# (within-cluster sum of squares) of each fit, to spot the k after
# which the inertia stops decreasing significantly.
clusters = pd.DataFrame({'clus': range(1, 15)})
inert = []
for k in clusters['clus']:
    k_means = KMeans(n_clusters=k, random_state=8)
    k_means.fit(X)
    inert.append(k_means.inertia_)


In [71]:
# Attach the recorded inertia values next to their cluster counts
clusters = clusters.assign(inertia=inert)

In [72]:
# Elbow plot: inertia as a function of the number of clusters
(
    alt.Chart(clusters)
    .mark_line()
    .encode(
        x='clus',
        y='inertia',
    )
)

#### From the elbow method and the silhouette score (a measure of cluster compactness), we deduce that the optimal number of clusters is 3

In [73]:
# Final model: k-means with the retained number of clusters (3),
# k-means++ initialisation and a fixed seed, fitted on the scaled features.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    random_state=8,
)
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=3, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=8, tol=0.0001, verbose=0)

In [74]:
# Label every author (row of df_5) with the cluster the fitted model assigns to it
df_5 = df_5.assign(clusters=kmeans.predict(X))

In [75]:
# Interactive scatter plot of the authors in the two clustering features,
# coloured by assigned cluster; hovering shows the author and its cluster.
(
    alt.Chart(df_5)
    .mark_circle()
    .encode(
        x='account_type',
        y='retweet',
        color='clusters:N',
        tooltip=['author', 'clusters'],
    )
    .interactive()
)

# Answers_Question 2

In [76]:
# Parse publish_date as datetimes and keep only the calendar year in a new
# 'year' column, then list the distinct years to find the oldest one.
df['year'] = pd.to_datetime(df['publish_date']).dt.year
df['year'].unique()

array([2017, 2016, 2015, 2014, 2018])

In [77]:
# List the distinct publish_date values of the oldest year (2014)
# so that the earliest date can be read off
df.loc[df['year'] == 2014, 'publish_date'].unique()

array(['12/1/2014 13:22', '11/28/2014 9:56', '11/28/2014 10:20',
       '11/28/2014 10:28', '11/28/2014 9:49', '11/28/2014 9:20',
       '11/28/2014 9:41', '12/9/2014 9:22', '11/28/2014 10:00',
       '11/28/2014 10:11', '11/28/2014 10:15', '12/6/2014 9:02',
       '12/9/2014 12:05', '12/1/2014 13:09', '11/28/2014 10:19',
       '11/28/2014 10:02', '11/28/2014 9:06', '11/28/2014 9:22',
       '11/28/2014 9:36', '11/28/2014 9:19', '12/9/2014 12:07',
       '12/4/2014 7:21', '12/9/2014 12:16', '11/28/2014 9:31',
       '12/16/2014 8:29', '12/1/2014 13:19', '11/28/2014 9:17',
       '12/8/2014 9:26', '11/27/2014 17:21', '12/9/2014 12:04',
       '12/4/2014 7:22', '11/28/2014 9:16', '11/28/2014 10:07'],
      dtype=object)

In [78]:
# Show author, account_type and account_category of the oldest tweet(s).
# The earliest timestamp is computed from the parsed publish_date column instead
# of being hard-coded after reading it off a printed list, so the cell stays
# correct if the data file changes and does not rely on a manual comparison.
publish_ts = pd.to_datetime(df['publish_date'])
df.loc[publish_ts == publish_ts.min(), ['author', 'account_type', 'account_category']]

Unnamed: 0,author,account_type,account_category
42203,ABIGAILSSILK,Hashtager,HashtagGamer


In [80]:
# Sum the `retweet` column for every distinct publish_date value and sort the
# totals in descending order.
# NOTE(review): publish_date holds date *and* time strings, so each group is a
# minute-level timestamp rather than a calendar day, and the sum is over the
# retweet column, not a count of all tweets. Confirm this matches the question.
# The aggregation uses the string 'sum' instead of np.sum, which recent pandas
# versions flag with a FutureWarning.
(
    df
    .groupby(['publish_date'])['retweet']
    .agg(no='sum')
    .reset_index()
    .sort_values('no', ascending=False)
)

Unnamed: 0,publish_date,no
4872,11/16/2016 16:11,12
20102,7/21/2015 19:12,11
20248,7/22/2015 17:32,10
20492,7/23/2015 8:11,9
20088,7/21/2015 17:32,9
...,...,...
15603,5/1/2017 14:08,0
15604,5/1/2017 15:22,0
15605,5/1/2017 15:23,0
15606,5/1/2017 15:25,0


# Answers_Question 3

In [81]:
# Find which author posted a tweet whose content is exactly this quote
quote = 'I am here for a purpose and that purpose is to grow into a mountain, not to shrink to a grain of sand. - Mandino #quote via @roxanamjones'
df.loc[df['content'] == quote, 'author']

31822    AMELIEBALDWIN
Name: author, dtype: object

In [82]:
# Collect, as a plain Python list, every tweet text posted by that author
df.loc[df['author'] == 'AMELIEBALDWIN', 'content'].tolist()

['.@JohnsonHildy may be on to something about the next @JamesOKeefeIII release. Or maybe not. https://t.co/yeCTqGabrf',
 '.@ViolaDavis just made #Oscars history, and it’s for an incredible reason https://t.co/F7wuktYQcD https://t.co/9UKC6joQxa',
 'Still waiting for Coyne to call Trudeau a liar https://t.co/lxsjjmdIgO',
 'Donors Demand Accountng of $1.2BL (Hey #Dems-Dont Win w/Just #Money Need Integrity,Policys4People,Vision,Competence) #politico #Salon #Yahoo https://t.co/WEMCFTU33W',
 "Clinton: Half of Trump Supporters Are in ‘Basket of Deplorables'   —Hillary Clinton told an audience of donors... https://t.co/Wdi2XI3828",
 'Learn from yesterday, live for day, hope for tomorrow. The important thing is not to stop questioning. -Albert Einstein #success',
 'Dear journalists: Please stop treating former Bush NSA and CIA chief Michael Hayden as some sort of arbiter of truth https://t.co/8Bnt9UuPgu',
 'US #Government Now Using Trolls To Attack #Conspiracy Theorists Websites, https://t.co/i

# Answers_Question 4

In [83]:
# Sum the `retweet` column for every (publish_date, account_category) pair using
# groupby, and sort the totals in descending order.
# NOTE(review): publish_date includes the time of day, so these are per-timestamp
# totals rather than daily ones. Confirm this is the intended granularity.
# The aggregation uses the string 'sum' instead of np.sum, which recent pandas
# versions flag with a FutureWarning.
(
    df
    .groupby(['publish_date', 'account_category'])['retweet']
    .agg(no='sum')
    .reset_index()
    .sort_values('no', ascending=False)
)

Unnamed: 0,publish_date,account_category,no
4948,11/16/2016 16:11,HashtagGamer,12
20343,7/21/2015 19:12,RightTroll,11
20489,7/22/2015 17:32,RightTroll,10
20733,7/23/2015 8:11,RightTroll,9
27617,8/8/2017 8:00,RightTroll,9
...,...,...,...
15656,4/8/2017 12:44,NewsFeed,0
15663,4/8/2017 14:44,NewsFeed,0
15673,4/8/2017 17:44,NewsFeed,0
15681,4/8/2017 1:44,NewsFeed,0


In [84]:
# Total of the `retweet` column per account category, with the category moved
# back from the index to a regular column for plotting.
# 'sum' is passed as a string (np.sum as aggfunc raises a FutureWarning in recent
# pandas) and reset_index() is chained rather than applied in place, so the cell
# can be re-run without side effects.
df_2 = (
    df
    .pivot_table(values=['retweet'], index='account_category', aggfunc='sum')
    .reset_index()
)

In [85]:
# Horizontal bar chart ranking the account categories by their summed
# `retweet` value; hovering over a bar shows the total.
(
    alt.Chart(df_2)
    .mark_bar()
    .encode(
        x='retweet',
        y='account_category',
        color='account_category',
        tooltip=['retweet'],
    )
    .interactive()
)

# Answers_Question 5

## 5.1/ When identifying fake or suspicious accounts on Twitter, the three most significant factors to consider are: activity, anonymity, and amplification.

### Indeed, fake Twitter accounts are characterized by:
### - Hyperactivity (a high ratio of number of posts to number of days of activity)
### - Little personal information
### - A large number of retweets

In [86]:
# Activity benchmark per account category: the summed `retweet` column divided by
# the number of distinct publish_date values seen for that category.
# Categories whose ratio reaches 1 or more are printed.
for category in df['account_category'].unique():
    category_rows = df[df['account_category'] == category]
    distinct_timestamps = len(category_rows['publish_date'].unique())
    H = category_rows['retweet'].sum() / distinct_timestamps
    if H >= 1:
        print(category)

LeftTroll


In [87]:
# Restrict to the LeftTroll category and tally its tweets by post_type
# (rows with a missing post_type are not counted by value_counts)
H = df.loc[df['account_category'] == 'LeftTroll']
H['post_type'].value_counts()

RETWEET        7094
QUOTE_TWEET     195
Name: post_type, dtype: int64

In [88]:
# List the distinct post_type values present in the whole dataset (NaN included)
pd.unique(df['post_type'])

array([nan, 'RETWEET', 'QUOTE_TWEET'], dtype=object)

## 5.2/ 

### - From the code above, LeftTroll is the only account category with a benchmark ratio over 1
### - Moreover, it has no personal information
### - Less personal information
### - More retweet-like posts

### LeftTroll is therefore a candidate for a fake account