In [2]:
import pandas as pd
import os
from urllib.parse import unquote
import datetime

os.chdir("/home/gabri/Desktop/ADA/ADA_wikispeedia")
print(os.getcwd())

SAVE_FOLDER = "data_processed"

/home/gabri/Desktop/ADA/ADA_wikispeedia


# Article Name tsv

In [4]:
df_names_path = "data/wikispeedia_paths-and-graph/articles.tsv"
df_names = pd.read_csv(df_names_path, sep='\t', header=None, comment='#')

# name a colum
df_names = df_names.rename(columns={0:"article_name"})

# Decode the article name
df_names.article_name = df_names.article_name.apply(unquote)

# Use " " instead of "_"
df_names.article_name = df_names.article_name.str.replace('_', ' ', regex=False)

#  Reset row index
df_names.reset_index(drop=True, inplace=True)

# Save processed df
df_names.to_csv(os.path.join(SAVE_FOLDER, "articles_processed.csv"), index = False)

df_names

Unnamed: 0,article_name
0,Áedán mac Gabráin
1,Åland
2,Édouard Manet
3,Éire
4,Óengus I of the Picts
...,...
4599,Zionism
4600,Zirconium
4601,Zoroaster
4602,Zuid-Gelders


# Article Category tsv

In [5]:
df_categories_path = "data/wikispeedia_paths-and-graph/categories.tsv"
df_categories = pd.read_csv(df_categories_path, sep='\t', header=None, comment='#')

# name a colum
df_categories = df_categories.rename(columns={0:"article_name", 1:"article_category"})

# Decode the article name
df_categories.article_name = df_categories.article_name.apply(unquote)

# Use " " instead of "_"
df_categories.article_name = df_categories.article_name.str.replace('_', ' ', regex=False)

# Use " " instead of "_"
df_categories.article_category = df_categories.article_category.str.replace('_', ' ', regex=False)

# Split the article_category into a list of categories
df_categories['article_category_list'] = df_categories['article_category'].str.split('.')

# Remove 'subject' from each list
df_categories['article_category_list'] = df_categories['article_category_list'].apply(lambda x: [cat for cat in x if cat.strip() != "subject"])

# Create a col for each cat
category_lists = df_categories['article_category_list']
category_df = pd.DataFrame(category_lists.tolist())
category_df.columns = [f'article_category_{i+1}' for i in range(category_df.shape[1])]
df_categories = pd.concat([df_categories, category_df], axis=1)


# HOW TO DEAL WITH ARTICLE THAT HAVE MORE CATEGORIES???
# ??????????????????????''
# ??????????????????????


#  Reset row index
df_categories.reset_index(drop=True, inplace=True)

df_categories

Unnamed: 0,article_name,article_category,article_category_list,article_category_1,article_category_2,article_category_3
0,Áedán mac Gabráin,subject.History.British History.British Histor...,"[History, British History, British History 150...",History,British History,British History 1500 and before including Roma...
1,Áedán mac Gabráin,subject.People.Historical figures,"[People, Historical figures]",People,Historical figures,
2,Åland,subject.Countries,[Countries],Countries,,
3,Åland,subject.Geography.European Geography.European ...,"[Geography, European Geography, European Count...",Geography,European Geography,European Countries
4,Édouard Manet,subject.People.Artists,"[People, Artists]",People,Artists,
...,...,...,...,...,...,...
5199,Zirconium,subject.Science.Chemistry.Chemical elements,"[Science, Chemistry, Chemical elements]",Science,Chemistry,Chemical elements
5200,Zoroaster,subject.People.Religious figures and leaders,"[People, Religious figures and leaders]",People,Religious figures and leaders,
5201,Zuid-Gelders,subject.Geography.European Geography,"[Geography, European Geography]",Geography,European Geography,
5202,Zuid-Gelders,subject.Language and literature.Languages,"[Language and literature, Languages]",Language and literature,Languages,


# Article links relationship

In [6]:
df_links_path = "data/wikispeedia_paths-and-graph/links.tsv"
df_links = pd.read_csv(df_links_path, sep='\t', header=None, comment='#')

# name a colum
df_links = df_links.rename(columns={0:"link_source", 1:"link_target"})

# Decode the article name and Use " " instead of "_"
df_links.link_source = df_links.link_source.apply(unquote)
df_links.link_source = df_links.link_source.str.replace('_', ' ', regex=False)

df_links.link_target = df_links.link_target.apply(unquote)
df_links.link_target = df_links.link_target.str.replace('_', ' ', regex=False)

df_links


Unnamed: 0,link_source,link_target
0,Áedán mac Gabráin,Bede
1,Áedán mac Gabráin,Columba
2,Áedán mac Gabráin,Dál Riata
3,Áedán mac Gabráin,Great Britain
4,Áedán mac Gabráin,Ireland
...,...,...
119877,Zulu,South Africa
119878,Zulu,Swaziland
119879,Zulu,United Kingdom
119880,Zulu,Zambia


# Path Finished


In [13]:
df_pf_path = "data/wikispeedia_paths-and-graph/paths_finished.tsv"
df_pf = pd.read_csv(df_pf_path, sep='\t', header=None, comment='#')

# name a colum
df_pf = df_pf.rename(columns={0:"hashed_ip", 1:"timestamp", 2:"duration", 3:"path", 4:"rating"})

# create a list of liks for rach path
df_pf['path_list'] = df_pf['path'].str.split(';')

# for each element of the path: 1) decode, 2) substitue "_" with " "
df_pf['path_list'] = df_pf['path_list'].apply(lambda links: [unquote(link.replace('_', ' ')) for link in links])

# source link and target link
df_pf['source_link'] = df_pf['path_list'].apply(lambda x: x[0] if len(x) > 0 else None)  # First element
df_pf['target_link'] = df_pf['path_list'].apply(lambda x: x[-1] if len(x) > 0 else None)  # Last element

# colum for finished or not
df_pf["finished"] = True

# column for cause fo unfinisehd in case (None for finished paths)
df_pf["type_unfinished"] = None

df_pf.head(2)

Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,source_link,target_link,finished,type_unfinished
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,"[14th century, 15th century, 16th century, Pac...",14th century,African slave trade,True,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[14th century, Europe, Africa, Atlantic slave ...",14th century,African slave trade,True,


# Path Unfinished

In [19]:
df_uf_path = "data/wikispeedia_paths-and-graph/paths_unfinished.tsv"
df_uf = pd.read_csv(df_uf_path, sep='\t', header=None, comment='#')

# name a colum
df_uf = df_uf.rename(columns={0:"hashed_ip", 1:"timestamp", 2:"duration", 3:"path", 4:"target_link", 5:"type_unfinished"})

# create a list of liks for rach path
df_uf['path_list'] = df_uf['path'].str.split(';')

# for each element of the path: 1) decode, 2) substitue "_" with " "
df_uf['path_list'] = df_uf['path_list'].apply(lambda links: [unquote(link.replace('_', ' ')) for link in links])

# source link and target link
df_uf['source_link'] = df_uf['path_list'].apply(lambda x: x[0] if len(x) > 0 else None)  # First element

# colum for finished or not
df_uf["finished"] = False

# Rating columns (not present in unfinsef
df_uf["rating"] = None

# Remove usless cols

df_uf.head(5)

Unnamed: 0,hashed_ip,timestamp,duration,path,target_link,type_unfinished,path_list,source_link,finished,rating
0,2426091a53125110,1297054935,1804,Obi-Wan_Kenobi,Microsoft,timeout,[Obi-Wan Kenobi],Obi-Wan Kenobi,False,
1,26141fd878806294,1297055651,1805,Julius_Caesar,Caracas,timeout,[Julius Caesar],Julius Caesar,False,
2,2b015fb8181c48f2,1297090819,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout,"[Malawi, Democracy, Alexander the Great]",Malawi,False,
3,53a53bc244e08a6a,1297094761,49,Paraguay,Mount_St._Helens,restart,[Paraguay],Paraguay,False,
4,53a53bc244e08a6a,1297099105,1808,Paraguay;Bolivia,Mount_St._Helens,timeout,"[Paraguay, Bolivia]",Paraguay,False,


# Merge

In [21]:
# Put cols in the same order
cols_name = df_pf.columns.to_list()
df_up = df_uf[cols_name]

# Syack veticllay the 2 df
df_p = pd.concat([df_pf, df_up], axis=0, ignore_index=True)

df_p

  df_p = pd.concat([df_pf, df_up], axis=0, ignore_index=True)


Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,source_link,target_link,finished,type_unfinished
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,"[14th century, 15th century, 16th century, Pac...",14th century,African slave trade,True,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[14th century, Europe, Africa, Atlantic slave ...",14th century,African slave trade,True,
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,"[14th century, Niger, Nigeria, British Empire,...",14th century,African slave trade,True,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,,"[14th century, Renaissance, Ancient Greece, Gr...",14th century,Greece,True,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,"[14th century, Italy, Roman Catholic Church, H...",14th century,John F. Kennedy,True,
...,...,...,...,...,...,...,...,...,...,...
76188,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,,"[Franz Kafka, Tuberculosis, World Health Organ...",Franz Kafka,Cholera,False,restart
76189,232f992e57d43e8d,1389787697,6,Modern_history,,[Modern history],Modern history,Hollandic,False,restart
76190,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,,"[Computer programming, Linguistics, Culture, P...",Computer programming,The_Beatles,False,timeout
76191,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,,"[Jamaica, United Kingdom, World War II, Battle...",Jamaica,Alan_Turing,False,timeout


In [23]:
# Add other metrics

# number cliks
df_p['n_click'] = df_p['path_list'].apply(len)

# Count occurrences of "<" in each list of path_list
df_p['n_back'] = df_p['path_list'].apply(lambda x: len([el for el in x if el == "<"]))

#convert timestamp into date
df_p['date'] = df_p['timestamp'].apply(datetime.datetime.fromtimestamp)

# click rate
df_p['click_rate'] = df_p['n_click']/df_p['duration']
df_p['normalized_duration'] = df_p['duration'] / df_p['n_click']

# freq back click
df_p['freq_back'] = df_p['n_back']/df_p['n_click']

# Save
df_p.to_csv(os.path.join(SAVE_FOLDER, "all_articles_processed.csv"), index = False)

df_p


Unnamed: 0,hashed_ip,timestamp,duration,path,rating,path_list,source_link,target_link,finished,type_unfinished,n_click,n_back,date,click_rate,normalized_duration,freq_back
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,,"[14th century, 15th century, 16th century, Pac...",14th century,African slave trade,True,,9,0,2011-02-15 04:26:49,0.054217,18.444444,0.000
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0,"[14th century, Europe, Africa, Atlantic slave ...",14th century,African slave trade,True,,5,0,2012-08-12 08:36:52,0.056818,17.600000,0.000
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,,"[14th century, Niger, Nigeria, British Empire,...",14th century,African slave trade,True,,8,0,2012-10-03 23:10:40,0.057971,17.250000,0.000
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,,"[14th century, Renaissance, Ancient Greece, Gr...",14th century,Greece,True,,4,0,2010-02-08 08:25:25,0.108108,9.250000,0.000
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0,"[14th century, Italy, Roman Catholic Church, H...",14th century,John F. Kennedy,True,,7,0,2013-04-23 17:27:08,0.040000,25.000000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76188,109ed71f571d86e9,1389787605,180,Franz_Kafka;Tuberculosis;World_Health_Organiza...,,"[Franz Kafka, Tuberculosis, World Health Organ...",Franz Kafka,Cholera,False,restart,8,1,2014-01-15 13:06:45,0.044444,22.500000,0.125
76189,232f992e57d43e8d,1389787697,6,Modern_history,,[Modern history],Modern history,Hollandic,False,restart,1,0,2014-01-15 13:08:17,0.166667,6.000000,0.000
76190,2e09a7224600a7cd,1389798400,1900,Computer_programming;Linguistics;Culture;Popul...,,"[Computer programming, Linguistics, Culture, P...",Computer programming,The_Beatles,False,timeout,5,1,2014-01-15 16:06:40,0.002632,380.000000,0.200
76191,60af9e2138051b96,1389799481,1903,Jamaica;United_Kingdom;World_War_II;Battle_of_...,,"[Jamaica, United Kingdom, World War II, Battle...",Jamaica,Alan_Turing,False,timeout,4,0,2014-01-15 16:24:41,0.002102,475.750000,0.000
