In [45]:
import pandas as pd
import os
import re
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
from collections import Counter
import warnings

In [47]:
# NOTE(review): machine-specific absolute path; point this at the local copy of the dataset folder.
folder_path = r'C:\Users\XPS\Desktop\DS5230\Project\unsupervisedfinalproj\russian_troll_dataset'

# Loading the dataset
# Silences every warning for the rest of the session, including any raised by pandas while parsing the CSVs.
warnings.filterwarnings("ignore")

# os.listdir returns names in arbitrary order; sorting makes the row order of the
# combined frame (and therefore the preview below) reproducible across runs and machines.
file_list = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# Read every CSV in the folder into its own DataFrame
df_list = []
for file_name in file_list:
    file_path = os.path.join(folder_path, file_name)
    df_list.append(pd.read_csv(file_path))

# Combining all DataFrames into one; ignore_index gives the result a fresh 0..n-1 index
data = pd.concat(df_list, ignore_index=True)
data.head()

Unnamed: 0,external_author_id,author,content,region,language,publish_date,harvested_date,following,followers,updates,...,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
0,906000000000000000,10_GOP,"""We have a sitting Democrat US Senator on tria...",Unknown,English,10/1/2017 19:58,10/1/2017 19:59,1052,9636,253,...,Right,0,RightTroll,0,905874659358453760,914580356430536707,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914580356430...,,
1,906000000000000000,10_GOP,Marshawn Lynch arrives to game in anti-Trump s...,Unknown,English,10/1/2017 22:43,10/1/2017 22:43,1054,9637,254,...,Right,0,RightTroll,0,905874659358453760,914621840496189440,http://twitter.com/905874659358453760/statuses...,https://twitter.com/damienwoody/status/9145685...,,
2,906000000000000000,10_GOP,Daughter of fallen Navy Sailor delivers powerf...,Unknown,English,10/1/2017 22:50,10/1/2017 22:51,1054,9637,255,...,Right,1,RightTroll,0,905874659358453760,914623490375979008,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/913231923715...,,
3,906000000000000000,10_GOP,JUST IN: President Trump dedicates Presidents ...,Unknown,English,10/1/2017 23:52,10/1/2017 23:52,1062,9642,256,...,Right,0,RightTroll,0,905874659358453760,914639143690555392,http://twitter.com/905874659358453760/statuses...,https://twitter.com/10_gop/status/914639143690...,,
4,906000000000000000,10_GOP,"19,000 RESPECTING our National Anthem! #StandF...",Unknown,English,10/1/2017 2:13,10/1/2017 2:13,1050,9645,246,...,Right,1,RightTroll,0,905874659358453760,914312219952861184,http://twitter.com/905874659358453760/statuses...,https://twitter.com/realDonaldTrump/status/914...,,


In [49]:
def extract_hashtags(text):
    """Return all hashtags found in ``text`` as a list, in order of appearance.

    ``text`` is converted with ``str()`` before matching, so non-string values
    are searched through their string form instead of raising. A hashtag is a
    ``#`` immediately followed by one or more word characters.
    """
    hashtag_pattern = re.compile(r'#\w+')
    as_text = str(text)
    return hashtag_pattern.findall(as_text)

def extract_mentions(text):
    """Return all @-mentions found in ``text`` as a list, in order of appearance.

    ``text`` is converted with ``str()`` before matching. A mention is an ``@``
    immediately followed by one or more word characters; the ``@`` does not have
    to start a word, so the domain part of an e-mail address is matched as well.
    """
    mention_pattern = re.compile(r'@\w+')
    as_text = str(text)
    return mention_pattern.findall(as_text)

def extract_urls(text):
    """Return all URLs found in ``text`` as a list, in order of appearance.

    ``text`` is converted with ``str()`` before matching. A URL is either
    ``http://`` / ``https://`` or ``www.`` followed by a run of non-whitespace
    characters, so punctuation glued to the end of a link is part of the match.
    """
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')
    as_text = str(text)
    return url_pattern.findall(as_text)

# Derive one list-valued column per entity type from the tweet text in 'content'
for column_name, extractor in (
    ('hashtags', extract_hashtags),
    ('mentions', extract_mentions),
    ('urls', extract_urls),
):
    data[column_name] = data['content'].apply(extractor)

# Element-wise list concatenation: each tweet's items are its hashtags, then mentions, then URLs
data['items'] = data['hashtags'].add(data['mentions']).add(data['urls'])

In [51]:
# One-hot encoding for Apriori: each tweet becomes one transaction of lower-cased items
transactions = [
    [token.lower() for token in tweet_items]
    for tweet_items in data['items']
]

# The dataset is large and has a high number of unique items, and encoding it
# ran into a MemoryError, so the encoder is asked for a sparse matrix instead.
te = TransactionEncoder()
te.fit(transactions)
te_fit = te.transform(transactions, sparse=True)

In [None]:
from scipy.sparse import csr_matrix

# Wrap the encoder's sparse matrix in a sparse-backed pandas DataFrame, labelling
# the columns with the item names learned by the encoder.
# NOTE(review): no dtype cast happens here; apriori expects boolean input, so this
# relies on the encoder's sparse output already being boolean - TODO confirm.
df_sparse = pd.DataFrame.sparse.from_spmatrix(
    data=te_fit,
    columns=te.columns_,
)

In [25]:
# Mine frequent itemsets from the sparse one-hot frame.
# An itemset is kept when its support is at least 0.0005; use_colnames reports item
# names instead of column indices; low_memory=True was chosen to avoid a MemoryError.
apriori_options = dict(min_support=0.0005, use_colnames=True, low_memory=True)
frequent_itemsets = apriori(df_sparse, **apriori_options)
frequent_itemsets.head()

Unnamed: 0,support,itemsets
0,0.000925,(#2016in4words)
1,0.001537,(#2a)
2,0.000588,(#aleppo)
3,0.000763,(#alternativeacronyminterpretations)
4,0.000621,(#amb)
5,0.000689,(#art)
6,0.000606,(#betteralternativetodebates)
7,0.004918,(#blacklivesmatter)
8,0.000555,(#blackskinisnotacrime)
9,0.001155,(#blacktwitter)


In [27]:
# Derive association rules from the frequent itemsets, keeping rules with lift >= 1.5
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.5)

# Show the first ten rules, restricted to the columns of interest
summary_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules.loc[:, summary_columns].head(10))

     antecedents    consequents   support  confidence        lift
0        (#tcot)          (#2a)  0.000551    0.103521   67.357304
1          (#2a)        (#tcot)  0.000551    0.358436   67.357304
2    (#brussels)  (#islamkills)  0.000611    0.816985  589.086057
3  (#islamkills)    (#brussels)  0.000611    0.440284  589.086057
4    (#business)        (#news)  0.000584    0.157663    3.544762
5        (#news)    (#business)  0.000584    0.013141    3.544762
6       (#pjnet)        (#ccot)  0.000724    0.160593   80.727019
7        (#ccot)       (#pjnet)  0.000724    0.363931   80.727019
8        (#tcot)        (#ccot)  0.001372    0.257813  129.597667
9        (#ccot)        (#tcot)  0.001372    0.689643  129.597667


In [43]:
# Rank the rules from highest to lowest confidence and print the top twenty
rules_sorted_by_conf = rules.sort_values('confidence', ascending=False)
top_by_confidence = rules_sorted_by_conf[['antecedents', 'consequents', 'support', 'confidence', 'lift']].iloc[:20]
print(top_by_confidence)

           antecedents         consequents   support  confidence         lift
22     (#tagderjugend)  (#jugendmitmerkel)  0.000569    0.998214  1537.347578
42            (#world)             (#news)  0.009335    0.995548    22.382987
53           (#тюмень)          (#новости)  0.000523    0.974684   268.375652
30         (#sandiego)            (#local)  0.000792    0.953431   107.661883
23  (#jugendmitmerkel)     (#tagderjugend)  0.000569    0.876634  1537.347578
19       (#stopthegop)        (#gopdebate)  0.000558    0.849174   626.399867
2          (#brussels)       (#islamkills)  0.000611    0.816985   589.086057
21        (#stopislam)       (#islamkills)  0.000502    0.795699   573.738067
56     (#pjnet, #ccot)             (#tcot)  0.000559    0.771683   145.014548
17        (#demdebate)       (#demndebate)  0.000900    0.749293   535.428916
27            (#miami)            (#local)  0.001142    0.731045    82.549914
48         (#политика)          (#новости)  0.000531    0.699151

In [41]:
# Persist the frequent itemsets and the confidence-ranked rules as CSV files
# in the current working directory (row index omitted).
for frame, csv_name in (
    (frequent_itemsets, 'frequent_itemsets_supp0005.csv'),
    (rules_sorted_by_conf, 'association_rules_lift15.csv'),
):
    frame.to_csv(csv_name, index=False)

In [None]:
# Optional: report each relation once instead of in both directions (A -> B and B -> A).
rules_copy = rules.copy()

# Presumably these columns already hold frozensets; the conversion guarantees hashable,
# order-independent values for the key built below.
rules_copy['antecedents'] = rules_copy['antecedents'].apply(frozenset)
rules_copy['consequents'] = rules_copy['consequents'].apply(frozenset)

# Key column: the union of all items on both sides of the rule. A frozenset has no
# order, so A -> B and B -> A get the same key. Every rule derived from one itemset
# (e.g. {a, b} -> c and a -> {b, c}) shares a key too, so only one of them survives.
rules_copy['rule'] = rules_copy.apply(
    lambda row: row['antecedents'] | row['consequents'], axis=1
)

# drop_duplicates keeps the first row per key. Ranking by confidence first means the
# surviving rule is the most confident direction, not whichever row came first.
rules_copy_unique = (
    rules_copy
    .sort_values(by='confidence', ascending=False, kind='mergesort')
    .drop_duplicates(subset='rule')
)

rules_sorted = rules_copy_unique.sort_values(by='lift', ascending=False)
print(rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(10))