In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime
import warnings
import matplotlib.pyplot as plt

In [2]:
## Silence every warning category for the rest of the session.
## NOTE(review): this blanket filter also hides library deprecation warnings -
## consider narrowing it to the specific warning that motivated it.
warnings.filterwarnings('ignore')
## IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
def true_false_setter(columns):
    """Encode a single cell value as a presence flag.

    Parameters
    ----------
    columns : scalar
        One cell value (despite the plural name, the function is applied
        element by element).

    Returns
    -------
    int
        0 when the value is the literal string 'null' or is missing
        according to ``pd.isnull`` (None / NaN); 1 for anything else.
    """
    # Both kinds of "missing" collapse to the same flag, so test them together.
    if columns == 'null' or pd.isnull(columns):
        return 0
    return 1

In [None]:
## Load the raw user table from a CSV file in the current working directory.
## NOTE(review): pandas' default NA markers include the literal string 'null',
## so such cells presumably arrive as NaN rather than as text - confirm.
dataset = pd.read_csv("user_data.csv")

In [None]:
## Summary of the freshly loaded frame (columns, dtypes, non-null counts).
dataset.info()

In [None]:
## Check duplicates: compare the number of distinct ids / usernames,
## then list every row whose username occurs more than once.

n_user_ids = dataset['user_id'].nunique()
n_usernames = dataset['username'].nunique()
print('Unique user_id: {}'.format(n_user_ids))
print('Unique username: {}'.format(n_usernames))

## keep=False flags all members of a duplicated group, not only the repeats
is_dup_username = dataset.duplicated('username', keep=False)
dataset[is_dup_username]

In [None]:
## Count the missing values in every column

print(dataset.isna().sum())

## Encode the social handles as presence flags (1 = handle present, 0 = missing)
## so they can be used in the later analysis

to_change = ['twitter', 'instagram']
for social_col in to_change:
    flags = dataset[social_col].map(true_false_setter)
    dataset[social_col] = flags.astype('int')

## "invited_by_user_profile" and "invited_by_club" are the relationship features,
## so a missing value needs an explicit label:
##   - no inviting profile means the profile was one of the original ones -> 0
##   - no inviting club means the user was invited by another user -> 0

for relation_col in ('invited_by_user_profile', 'invited_by_club'):
    dataset[relation_col] = dataset[relation_col].fillna(0)


In [None]:
## Derived features used by the later analysis.

## Creation date of the profile: the first 10 characters of "time_created"
## (presumably the YYYY-MM-DD part of a timestamp string - confirm).
dataset['date_created'] = pd.to_datetime(dataset['time_created'].str[0:10])

## Auxiliary dataframe: for every inviter id, the number of profiles it invited.
invt_df = (
    dataset['invited_by_user_profile']
    .value_counts()
    .rename_axis('user_id')
    .reset_index(name='invite_count')
)

## Remove the "no inviter" placeholders so they are never counted as a real user:
## the literal 'null' string and the 0 that the earlier cell substitutes for NaN.
invt_df = invt_df[(invt_df['user_id'] != 'null') & (invt_df['user_id'] != 0)]

## Add the invite count to the original dataset, useful for graph data management.
## A stale "invite_count" column (left by a previous run of this cell) is dropped
## first; otherwise the merge would create invite_count_x / invite_count_y and the
## next line would fail with a KeyError.
dataset = (
    dataset
    .drop(columns='invite_count', errors='ignore')
    .merge(invt_df, how='left', on='user_id')
)

## Users without a match in invt_df never invited anyone -> 0. The left merge
## turns the counts into floats because of the NaN, so restore an integer dtype.
dataset['invite_count'] = dataset['invite_count'].fillna(0).astype('int')


In [None]:
## Remove columns that are not needed any more ("time_created" has already been
## reduced to "date_created").
## errors='ignore' keeps the cell re-runnable: without it a second execution
## raises KeyError because the columns are already gone.

dataset = dataset.drop(columns=['photo_url', 'time_created'], errors='ignore')

In [None]:
## Final check of the prepared frame: schema summary first, then the leading
## rows (shown as the cell's output because it is the last expression).
dataset.info()
dataset.head()