In [4]:
import pandas as pd
import os
from urllib.parse import unquote
import datetime

# Run everything from the project root so the relative "data/..." paths used
# by the loading cells resolve. The root can be overridden through the
# WIKISPEEDIA_ROOT environment variable; the default is the original
# author's checkout location.
PROJECT_ROOT = os.environ.get("WIKISPEEDIA_ROOT", "/home/gabri/Desktop/ADA/ADA_wikispeedia")
os.chdir(PROJECT_ROOT)
print(os.getcwd())

# Folder (relative to the project root) that receives the processed tables.
# It is created up front so that writing into it does not fail on a fresh
# checkout where the folder does not exist yet.
SAVE_FOLDER = "data_processed"
os.makedirs(SAVE_FOLDER, exist_ok=True)

/home/gabri/Desktop/ADA/ADA_wikispeedia


# Article Name tsv

In [5]:
df_names_path = "data/wikispeedia_paths-and-graph/articles.tsv"

# Load the single-column article list (text after a '#' is treated as a
# comment by the reader), name the column and keep a clean 0..n-1 row index.
df_names = (
    pd.read_csv(df_names_path, sep='\t', header=None, comment='#')
    .rename(columns={0: "article_name"})
    .reset_index(drop=True)
)

# Decode the percent-encoded article name, then use " " instead of "_"
df_names["article_name"] = (
    df_names["article_name"].apply(unquote).str.replace('_', ' ', regex=False)
)

# Save processed df
df_names.to_csv(os.path.join(SAVE_FOLDER, "articles_processed.csv"), index=False)

# Preview goes last: only the final expression of a cell is rendered.
df_names.head()

# Article Category tsv

In [34]:
df_categories_path = "data/wikispeedia_paths-and-graph/categories.tsv"

# Read the table (text after a '#' is ignored) and label its two columns.
df_categories = (
    pd.read_csv(df_categories_path, sep='\t', header=None, comment='#')
    .rename(columns={0: "article_name", 1: "article_category"})
)

# Article names: percent-decode, then turn "_" into " ".
df_categories["article_name"] = (
    df_categories["article_name"].apply(unquote).str.replace('_', ' ', regex=False)
)

# Category strings: turn "_" into " ".
df_categories["article_category"] = (
    df_categories["article_category"].str.replace('_', ' ', regex=False)
)

# Break the dotted category string into its levels and drop every level
# that equals "subject" once surrounding whitespace is stripped.
df_categories["article_category_list"] = (
    df_categories["article_category"]
    .str.split('.')
    .apply(lambda levels: [level for level in levels if level.strip() != "subject"])
)

# Spread the levels over numbered columns: article_category_1, _2, ...
# Rows with fewer levels than the deepest one are padded with missing values.
category_lists = df_categories["article_category_list"]
category_df = pd.DataFrame(category_lists.tolist()).rename(
    columns=lambda level_idx: f'article_category_{level_idx+1}'
)

# TODO: the same article can appear on several rows (one per category);
# decide how articles with more than one category should be handled.
df_categories = pd.concat([df_categories, category_df], axis=1).reset_index(drop=True)

df_categories

Unnamed: 0,article_name,article_category,article_category_list,article_category_1,article_category_2,article_category_3
0,Áedán mac Gabráin,subject.History.British History.British Histor...,"[History, British History, British History 150...",History,British History,British History 1500 and before including Roma...
1,Áedán mac Gabráin,subject.People.Historical figures,"[People, Historical figures]",People,Historical figures,
2,Åland,subject.Countries,[Countries],Countries,,
3,Åland,subject.Geography.European Geography.European ...,"[Geography, European Geography, European Count...",Geography,European Geography,European Countries
4,Édouard Manet,subject.People.Artists,"[People, Artists]",People,Artists,
...,...,...,...,...,...,...
5199,Zirconium,subject.Science.Chemistry.Chemical elements,"[Science, Chemistry, Chemical elements]",Science,Chemistry,Chemical elements
5200,Zoroaster,subject.People.Religious figures and leaders,"[People, Religious figures and leaders]",People,Religious figures and leaders,
5201,Zuid-Gelders,subject.Geography.European Geography,"[Geography, European Geography]",Geography,European Geography,
5202,Zuid-Gelders,subject.Language and literature.Languages,"[Language and literature, Languages]",Language and literature,Languages,


# Article links relationship

In [36]:
df_links_path = "data/wikispeedia_paths-and-graph/links.tsv"

# One row per link: column 0 becomes the source article, column 1 the target.
df_links = (
    pd.read_csv(df_links_path, sep='\t', header=None, comment='#')
    .rename(columns={0: "link_source", 1: "link_target"})
)

# Both endpoints get the same clean-up: percent-decode, then "_" -> " ".
for endpoint in ("link_source", "link_target"):
    df_links[endpoint] = df_links[endpoint].apply(unquote).str.replace('_', ' ', regex=False)

df_links


Unnamed: 0,link_source,link_target
0,Áedán mac Gabráin,Bede
1,Áedán mac Gabráin,Columba
2,Áedán mac Gabráin,Dál Riata
3,Áedán mac Gabráin,Great Britain
4,Áedán mac Gabráin,Ireland
...,...,...
119877,Zulu,South Africa
119878,Zulu,Swaziland
119879,Zulu,United Kingdom
119880,Zulu,Zambia


# Path Finished


In [11]:
df_pf_path = "data/wikispeedia_paths-and-graph/paths_finished.tsv"
df_pf = pd.read_csv(df_pf_path, sep='\t', header=None, comment='#')

# Name the columns
df_pf = df_pf.rename(columns={0:"hashed_ip", 1:"timestamp", 2:"duration", 3:"path", 4:"rating"})

# Turn each ';'-separated path into a list of entries. Every entry is
# percent-decoded first and only then has "_" replaced by " " -- the same
# order used when cleaning the article, category and link tables, so that an
# encoded underscore ends up identical in every table.
df_pf['path_list'] = df_pf['path'].str.split(';').apply(
    lambda links: [unquote(link).replace('_', ' ') for link in links]
)

# Number of entries in the path. Note that this counts the starting article
# and every "<" entry as well.
df_pf['n_click'] = df_pf['path_list'].str.len()

# Source link and target link: first and last entry of the path
df_pf['source_link'] = df_pf['path_list'].str[0]
df_pf['target_link'] = df_pf['path_list'].str[-1]

# Count occurrences of "<" in each path_list
df_pf['n_back'] = df_pf['path_list'].apply(lambda entries: entries.count("<"))

# Convert the timestamp into a date.
# NOTE(review): fromtimestamp() interprets the value in the local time zone
# of the machine running the notebook, so the resulting dates differ between
# machines -- confirm whether a fixed zone (e.g. UTC) is wanted here.
df_pf['date'] = df_pf['timestamp'].apply(datetime.datetime.fromtimestamp)

# Every row of this table is a finished path
df_pf["finished"] = True

# Column for the cause of an unfinished path; left empty for this table
df_pf["type_unfinished"] = None

# Click rate: path entries per unit of duration. A duration of 0 is masked to
# NaN first so the ratio is NaN instead of inf, which would otherwise poison
# any later mean/sum over this column.
df_pf['click_rate'] = df_pf['n_click'] / df_pf['duration'].mask(df_pf['duration'] == 0)
df_pf['normalized_duration'] = df_pf['duration'] / df_pf['n_click']

# Fraction of path entries that are "<"
df_pf['freq_back'] = df_pf['n_back'] / df_pf['n_click']


df_pf.head(5)

Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,n_click,source_link,target_link,n_back,date,finished,type_unfinished,click_rate,freq_back,normalized_duration
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,"[14th century, 15th century, 16th century, Pac...",9,14th century,African slave trade,0,2011-02-15 04:26:49,True,,0.054217,0.0,18.444444
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[14th century, Europe, Africa, Atlantic slave ...",5,14th century,African slave trade,0,2012-08-12 08:36:52,True,,0.056818,0.0,17.6
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,"[14th century, Niger, Nigeria, British Empire,...",8,14th century,African slave trade,0,2012-10-03 23:10:40,True,,0.057971,0.0,17.25
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,,"[14th century, Renaissance, Ancient Greece, Gr...",4,14th century,Greece,0,2010-02-08 08:25:25,True,,0.108108,0.0,9.25
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,"[14th century, Italy, Roman Catholic Church, H...",7,14th century,John F. Kennedy,0,2013-04-23 17:27:08,True,,0.04,0.0,25.0
