# Pandas Homework Part 2

`pandas` version

In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
# Column names for the header-less Wikipedia clickstream TSV files;
# passed as `names=` to every `pd.read_csv` call that loads them.
columns = [
    'prev',
    'curr',
    'type',
    'n',
]

In [19]:
%%time
# 1, 2

for country in ["de", "pl"]:
    df = pd.read_csv(f"../data/wikipedia/clickstream-{country}wiki-2022-03.tsv.gz", sep="\t", names=columns, on_bad_lines='warn', quoting=3)
    s = df.query("type == 'external'").groupby("curr")['n'].sum().sort_values(ascending=False).head().index[0]
    print(country, s)

de Ukraine
pl Ukraina
CPU times: user 13.3 s, sys: 304 ms, total: 13.6 s
Wall time: 13.7 s


In [5]:
# Load the travel.stackexchange.com dump, one DataFrame per XML file.
# Badges.xml is read with pandas' default XML parser; the remaining files
# explicitly request the 'etree' parser.
badges = pd.read_xml(
    "../data/travel/travel.stackexchange.com/Badges.xml"
)
posts = pd.read_xml(
    "../data/travel/travel.stackexchange.com/Posts.xml",
    parser='etree',
)
tags = pd.read_xml(
    "../data/travel/travel.stackexchange.com/Tags.xml",
    parser='etree',
)
users = pd.read_xml(
    "../data/travel/travel.stackexchange.com/Users.xml",
    parser='etree',
)
votes = pd.read_xml(
    "../data/travel/travel.stackexchange.com/Votes.xml",
    parser='etree',
)

In [6]:
# English Wikipedia clickstream, loaded with the same column names as the
# other language editions.
wiki = pd.read_csv(
    "../data/wikipedia/clickstream-enwiki-2022-03.tsv.gz",
    sep="\t",
    names=columns,
    quoting=3,  # 3 == csv.QUOTE_NONE: quote characters are ordinary text
)

In [7]:
%%time
# 3, 4
tid = badges.merge(users, left_on="UserId", right_on="Id").groupby("UserId").size().sort_values().index[-1]
top_user = users.loc[users["Id"] == tid, :]
top_user = top_user.reset_index(drop=True).loc[0, ['DisplayName', 'Location']]
top_user

CPU times: user 166 ms, sys: 3.64 ms, total: 170 ms
Wall time: 169 ms


DisplayName                    Mark Mayo
Location       Christchurch, New Zealand
Name: 0, dtype: object

In [8]:
%%time
# 5
city = top_user['Location'].split(", ")[0]
wiki.loc[wiki['curr'] == city, :]['n'].sum()

CPU times: user 1.4 s, sys: 67.7 ms, total: 1.46 s
Wall time: 1.45 s


25804

In [9]:
# Note that this part can be done in many ways;
# this version focuses on showing how to work with `apply` in a non-standard way.

# https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
import re
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
    """Remove every ``<...>`` span from a string.

    Each shortest run of characters starting with ``<`` and ending with ``>``
    is deleted. The pattern's ``.`` does not match a newline, so a span that
    is broken across lines is left in place.

    Parameters
    ----------
    raw_html : object
        Text to clean. Values that are not ``str`` (for example a missing
        value) are passed through untouched.

    Returns
    -------
    object
        The cleaned string, or ``raw_html`` itself when it is not a ``str``.
    """
    if not isinstance(raw_html, str):
        return raw_html
    return CLEANR.sub('', raw_html)

In [10]:
def aux(l):
    """Count case-insensitive occurrences of the words in one token list.

    Parameters
    ----------
    l : object
        Expected to be a list of strings. Anything that is not a ``list``
        (for example a missing value) yields an empty counter.

    Returns
    -------
    collections.defaultdict
        ``defaultdict(int)`` mapping each lower-cased word to how many times
        it appears in ``l``.
    """
    counts = defaultdict(int)
    if not isinstance(l, list):
        return counts
    for word in l:
        key = word.lower()
        counts[key] += 1
    return counts

In [11]:
%%time
# 6, 7
dicts = posts['Body'].apply(cleanhtml).str.replace("\n", " ").str.split(" ").apply(aux)
# Even better solution:
# dicts = posts['Body'].str.replace('<.*?>', "", regex=True).str.replace("\n", " ").str.split(" ").apply(aux)
big_d = defaultdict(int)
for d in dicts:
    for k, v in d.items():
        big_d[k] += v

s = pd.Series(big_d, name="Count").reset_index()
s.rename(columns={'index':'Word'}, inplace=True)

words = s.loc[s.Word.str.len() > 7, :].sort_values("Count", ascending=False).head()

# 3 points
theword = words['Word'].iloc[0]
theword, wiki.query("curr == @theword.capitalize()")['n'].sum()

CPU times: user 9.94 s, sys: 722 ms, total: 10.7 s
Wall time: 10.7 s


('passport', 31631)

In [14]:
%%time
# 6, 7
# Just different approach
words = (
    posts['Body']
    .str.replace('<.*?>', "", regex=True)
    .str.replace("\n", " ")
    .str.split(" ")
    .explode()
    .str.lower()
)
theword = words[words.str.len() > 7].value_counts().head(1).index[0]
theword, wiki.query("curr == @theword.capitalize()")['n'].sum()

CPU times: user 12.8 s, sys: 1.53 s, total: 14.4 s
Wall time: 14.4 s


('passport', 31631)

In [15]:
%%time
# 8, 9
upvotes = (
    votes
    .query('VoteTypeId == 2')
    .groupby("PostId")
    .size()
    .reset_index(name="UpVotes")
)
downvotes = (
    votes
    .query('VoteTypeId == 3')
    .groupby("PostId")
    .size()
    .reset_index(name="DownVotes")
)

posts2 = (
    posts
    .merge(upvotes, left_on="Id", right_on="PostId", how='left')
    .merge(downvotes, left_on="Id", right_on="PostId", how='left')
)
posts2.loc[:, ['UpVotes', 'DownVotes']] = posts2.loc[:, ['UpVotes', 'DownVotes']].fillna(value=0)

posts2['UpVoteRatio'] = posts2['UpVotes'] - posts2['DownVotes']

(
    posts2
    .merge(users, left_on="OwnerUserId", right_on="Id")
    .sort_values("UpVoteRatio", ascending=False)
    .reset_index(drop=True)
    .loc[0, ['Score', 'DisplayName']]
)

CPU times: user 933 ms, sys: 11.9 ms, total: 945 ms
Wall time: 943 ms


Score                     547
DisplayName    Andrew Lazarus
Name: 0, dtype: object

In [16]:
%%time
# 10
votes
votes['CreationDateDT'] = pd.to_datetime(votes['CreationDate'])
votes.set_index("CreationDateDT", inplace=True)

votesagg = votes.groupby(pd.Grouper(freq="M")).size()

votesagg.sort_values(ascending=False).index[0]


CPU times: user 212 ms, sys: 8.26 ms, total: 220 ms
Wall time: 219 ms


Timestamp('2016-08-31 00:00:00')

In [17]:
%%time
# 11
# votesagg is sorted by index (CreationDateDT) 
votesagg.diff().sort_values().index[0]

CPU times: user 0 ns, sys: 1.9 ms, total: 1.9 ms
Wall time: 1.73 ms


Timestamp('2015-10-31 00:00:00')

In [18]:
%%time
# 12

posts3 = posts.merge(users, left_on="OwnerUserId", right_on="Id")
tags = posts3.loc[
    posts3['Location'].str.contains("Poland") | 
    posts3['Location'].str.contains("Polska"), 
    'Tags'
]
(
    tags
    .str.strip("<")
    .str.strip(">")
    .str.split("><")
    .dropna()
    .explode()
    .value_counts()
    .head(1)
)

CPU times: user 452 ms, sys: 37 µs, total: 452 ms
Wall time: 451 ms


air-travel    34
Name: Tags, dtype: int64