# Graph Analysis

## Setting up

In [1]:
import pandas as pd 
import datetime

In [9]:
# Origin used when turning absolute timestamps into second offsets:
# midnight on 2022-04-01, as a naive (timezone-less) datetime.
REFERENCE_TIMESTAMP = datetime.datetime(year=2022, month=4, day=1)

## Load and explore data

In [2]:
# Read the whole raw history file into memory, then print a summary of its
# columns, dtypes and memory footprint.
df = pd.read_csv(filepath_or_buffer='2022_place_canvas_history.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160353104 entries, 0 to 160353103
Data columns (total 4 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   timestamp    object
 1   user_id      object
 2   pixel_color  object
 3   coordinate   object
dtypes: object(4)
memory usage: 4.8+ GB


In [3]:
# Number of distinct user ids in the raw data (missing values are not counted).
print(df['user_id'].nunique(dropna=True))

10381163


## Clean data

In [4]:
# Keep only the rows whose coordinate string contains exactly one comma,
# i.e. a single "a,b" pair; every other row is discarded.
df = df.loc[df['coordinate'].map(lambda raw_coordinate: raw_coordinate.count(',') == 1)]

In [5]:
def _strip_utc_suffix(timestamp):
    """Return ``str(timestamp)`` with a trailing ' UTC' marker removed, if present.

    Values that do not end in ' UTC' are returned unchanged (as strings).
    """
    text = str(timestamp)
    # A blind ``[:-4]`` slice removes four characters every time it runs, so
    # re-executing the cell (or meeting a value without the marker) would
    # silently eat into the seconds / fraction digits. Stripping only when
    # the marker is there keeps the cell idempotent.
    # NOTE(review): assumes the 4 characters being removed are ' UTC' —
    # confirm against the raw CSV.
    return text[:-4] if text.endswith(' UTC') else text

df['timestamp'] = df['timestamp'].apply(_strip_utc_suffix)

In [None]:
# Order the rows by the timestamp column, smallest value first (the default
# direction). The column holds strings at this point, so the comparison is
# a plain string comparison.
df = df.sort_values('timestamp')

In [8]:
# Replace each user id by an integer code. Codes are handed out in order of
# first appearance, so the numbering depends on the row order at this point.
df['user_id_num'] = df['user_id'].factorize()[0]
df = df.drop(columns=['user_id'])

In [10]:
def convert_timestamp_to_timedelta( timestamp_str, reference_timestamp):
    """Return the offset of ``timestamp_str`` from ``reference_timestamp`` in seconds.

    Despite the name, the result is a float number of seconds, not a
    ``datetime.timedelta``.

    Parameters:
        timestamp_str: text shaped like "YYYY-MM-DD HH:MM:SS", optionally
            followed by ".ffffff" fractional seconds.
        reference_timestamp: ``datetime.datetime`` subtracted from the
            parsed value.

    Returns:
        float: seconds elapsed since the reference; negative when the
        timestamp is earlier than the reference.

    Raises:
        ValueError: if ``timestamp_str`` matches neither layout.
    """
    # The presence of a dot decides whether a fractional part must be parsed.
    if '.' in timestamp_str:
        layout = "%Y-%m-%d %H:%M:%S.%f"
    else:
        layout = "%Y-%m-%d %H:%M:%S"
    moment = datetime.datetime.strptime(timestamp_str, layout)
    return (moment - reference_timestamp).total_seconds()

# Turn every timestamp string into a second offset from REFERENCE_TIMESTAMP,
# then drop the textual column it was derived from.
df['time'] = df['timestamp'].map(
    lambda raw_timestamp: convert_timestamp_to_timedelta(str(raw_timestamp), REFERENCE_TIMESTAMP)
)
df = df.drop(columns=['timestamp'])

In [11]:
def get_coordinate(coordinate_str, dim):
    """Return one component of a comma-separated coordinate string as an int.

    Parameters:
        coordinate_str: text such as "12,34".
        dim: index of the wanted component after splitting on ','
            (0 for the first value, 1 for the second).

    Returns:
        int: the selected component.

    Raises:
        IndexError: if ``dim`` is outside the split result.
        ValueError: if the selected piece is not an integer literal.
    """
    components = coordinate_str.split(',')
    selected = components[dim]
    return int(selected)
    
# Split the "a,b" coordinate text into two integer columns: the first
# component goes to X, the second to Y. The raw text column is then removed.
df['X'] = df['coordinate'].map(lambda raw: get_coordinate(str(raw), 0))
df['Y'] = df['coordinate'].map(lambda raw: get_coordinate(str(raw), 1))
df = df.drop(columns=['coordinate'])

In [12]:
# Persist the cleaned frame as a comma-separated file (the default
# separator), leaving out the row index.
df.to_csv(path_or_buf='2022_place_canvas_history_cleared.csv', index=False)