In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sqlalchemy import create_engine

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Read the SQLAlchemy connection URL from the environment so the database
# password is never stored in the notebook. Expected form:
#   mysql+pymysql://<user>:<url-encoded password>@<host>:<port>/<database>
db_url = os.environ.get("JOBMA_DB_URL")
if not db_url:
    raise RuntimeError(
        "Set the JOBMA_DB_URL environment variable to the SQLAlchemy "
        "connection URL before running this notebook."
    )
engine = create_engine(db_url)

In [4]:
def _run_query(query):
    """Execute ``query`` through the shared ``engine`` and return the rows as a DataFrame."""
    return pd.read_sql(query, con=engine)


# Catcher account details.
catcher_df = _run_query('Select * FROM jobma_catcher')
# Subscription type and plan type.
wallet_df = _run_query('Select * FROM wallet')
# Number and amount of subscriptions bought by the catcher.
subscription_df = _run_query('Select * FROM subscription_history')
# Invitations sent by the catcher.
invitation_df = _run_query('Select * FROM jobma_pitcher_invitations')
# Jobs posted by the catcher.
job_posting_df = _run_query('Select * FROM jobma_employer_job_posting')
# Kits created by the catcher.
kit_df = _run_query('Select * FROM job_assessment_kit')
# Recorded interviews created by the catcher.
recorded_interview_df = _run_query('Select * FROM jobma_interviews')
# Live interviews created by the catcher.
live_interview_df = _run_query('Select * FROM jobma_interviews_online')

# Catcher_df

In [6]:
# Keep only the catcher columns that go into the final dataset.
catcher_columns = [
    'jobma_catcher_id',
    'is_premium',
    'jobma_catcher_is_deleted',
    'jobma_verified',
    'subscription_status',
    'company_size',
]
catcher_df = catcher_df.loc[:, catcher_columns]

In [7]:
# Remove rows that are identical across all selected columns.
catcher_df = catcher_df.drop_duplicates()

# Wallet_df

In [9]:
# Align the key column name with the one used by catcher_df.
wallet_df = wallet_df.rename({'catcher_id': 'jobma_catcher_id'}, axis='columns')

In [10]:
# Keep the key plus the three wallet attributes used downstream.
wallet_columns = ['jobma_catcher_id', 'subscription_type', 'plan_type', 'is_unlimited']
wallet_df = wallet_df.loc[:, wallet_columns]

In [11]:
# Remove rows that are identical across all selected columns.
wallet_df = wallet_df.drop_duplicates()

In [12]:
# Show which columns wallet_df currently holds.
wallet_df.columns

Index(['jobma_catcher_id', 'subscription_type', 'plan_type', 'is_unlimited'], dtype='object')

# Subscription_df

In [14]:
# Align the key column name with the one used by catcher_df.
subscription_df = subscription_df.rename({'catcher_id': 'jobma_catcher_id'}, axis='columns')

In [15]:
# Collapse the subscription history to one row per catcher.
# NOTE(review): 'count' tallies non-null sub_user_id rows, not distinct sub
# users — confirm that is the intended meaning of jobma_catcher_sub_accounts.
subscription_summary = subscription_df.groupby('jobma_catcher_id').agg(
    jobma_catcher_sub_accounts=pd.NamedAgg(column='sub_user_id', aggfunc='count'),
    subscription_amount=pd.NamedAgg(column='subscription_amount', aggfunc='sum'),
    number_of_subscriptions=pd.NamedAgg(column='subscription_amount', aggfunc='count'),
)
subscription_df = subscription_summary.reset_index()

In [16]:
# Remove rows that are identical across all columns.
subscription_df = subscription_df.drop_duplicates()

# Invitation_df

In [18]:
# Attach to every invitation row the total number of rows sharing its catcher id.
invitations_per_catcher = invitation_df['jobma_catcher_id'].value_counts()
invitation_df = invitation_df.assign(
    number_of_invitations=invitation_df['jobma_catcher_id'].map(invitations_per_catcher)
)

In [19]:
# Keep only the key and its invitation count.
invitation_df = invitation_df.loc[:, ['jobma_catcher_id', 'number_of_invitations']]

In [20]:
# Remove the repeated (catcher id, count) rows.
invitation_df = invitation_df.drop_duplicates()

# Job_posting_df

In [22]:
# Attach to every job-posting row the total number of rows sharing its catcher id.
jobs_per_catcher = job_posting_df['jobma_catcher_id'].value_counts()
job_posting_df = job_posting_df.assign(
    job_posted=job_posting_df['jobma_catcher_id'].map(jobs_per_catcher)
)

In [23]:
# Keep only the key and its job-posting count.
job_posting_df = job_posting_df.loc[:, ['jobma_catcher_id', 'job_posted']]

In [24]:
# Remove the repeated (catcher id, count) rows.
job_posting_df = job_posting_df.drop_duplicates()

# Kit_df

In [26]:
# Align the key column name with the one used by catcher_df.
kit_df = kit_df.rename({'catcher_id': 'jobma_catcher_id'}, axis='columns')

In [27]:
# Attach to every kit row the total number of rows sharing its catcher id.
kits_per_catcher = kit_df['jobma_catcher_id'].value_counts()
kit_df = kit_df.assign(
    number_of_kits=kit_df['jobma_catcher_id'].map(kits_per_catcher)
)

In [28]:
# Keep only the key and its kit count.
kit_df = kit_df.loc[:, ['jobma_catcher_id', 'number_of_kits']]

In [29]:
# Remove the repeated (catcher id, count) rows.
kit_df = kit_df.drop_duplicates()

# Recorded_Interview_df

In [31]:
# List the columns available on the recorded-interview table.
recorded_interview_df.columns

Index(['id', 'jobma_post_id', 'jobma_pitcher_id', 'jobma_catcher_id',
       'interview_applied_id', 'jobma_invitation_id',
       'jobma_interview_question_id', 'jobma_answers', 'jobma_video_answer',
       'retake_requested', 'payment_status', 'interview_rate',
       'interview_credit', 'view_date', 'subscription_type',
       'interview_cost_type', 'evaluated_by', 'is_streaming', 'deleted_at',
       'created_at', 'updated_at'],
      dtype='object')

In [32]:
# Attach to every recorded-interview row the total number of rows sharing its catcher id.
recorded_per_catcher = recorded_interview_df['jobma_catcher_id'].value_counts()
recorded_interview_df = recorded_interview_df.assign(
    number_of_recorded_interviews=recorded_interview_df['jobma_catcher_id'].map(recorded_per_catcher)
)

In [33]:
# Keep only the key and its recorded-interview count.
recorded_interview_df = recorded_interview_df.loc[
    :, ['jobma_catcher_id', 'number_of_recorded_interviews']
]

In [34]:
# Remove the repeated (catcher id, count) rows.
recorded_interview_df = recorded_interview_df.drop_duplicates()

# Live_Interview_df

In [36]:
# List the columns available on the live-interview table.
live_interview_df.columns

Index(['id', 'invitation_id', 'jobma_pitcher_id', 'jobma_job_post_id',
       'jobma_catcher_id', 'video_names', 'interview_status', 'payment_status',
       'record', 'interview_rate', 'interview_credit', 'video_time',
       'view_date', 'subscription_type', 'interview_cost_type',
       'video_request_status', 'deleted_at', 'created_at', 'updated_at'],
      dtype='object')

In [37]:
# Attach to every live-interview row the total number of rows sharing its catcher id.
live_per_catcher = live_interview_df['jobma_catcher_id'].value_counts()
live_interview_df = live_interview_df.assign(
    number_of_live_interviews=live_interview_df['jobma_catcher_id'].map(live_per_catcher)
)

In [38]:
# Keep only the key and its live-interview count.
live_interview_df = live_interview_df.loc[:, ['jobma_catcher_id', 'number_of_live_interviews']]

In [39]:
# Remove the repeated (catcher id, count) rows.
live_interview_df = live_interview_df.drop_duplicates()

# Merging Dataframes

In [41]:
# Report the (rows, columns) shape of every frame before merging.
shape_report = [
    f'Catcher df shape is {catcher_df.shape}',
    f'Wallet df shape is {wallet_df.shape}',
    f'Subscription df shape is {subscription_df.shape}',
    f'Invitation df shape is {invitation_df.shape}',
    f'Job Posting df shape is {job_posting_df.shape}',
    f'Number of Kit df shape is {kit_df.shape}',
    f'Number of Recorded Interview df shape is {recorded_interview_df.shape}',
    f'Number of Live Interview df shape is {live_interview_df.shape}',
]
print('\n'.join(shape_report))

Catcher df shape is (6116, 6)
Wallet df shape is (4479, 4)
Subscription df shape is (4477, 4)
Invitation df shape is (1171, 2)
Job Posting df shape is (1127, 2)
Number of Kit df shape is (1192, 2)
Number of Recorded Interview df shape is (857, 2)
Number of Live Interview df shape is (57, 2)


In [42]:
# Start from the catcher table; every other table is attached to it by id.
final_df = catcher_df.copy()

# Left join each per-catcher table one by one. A left join silently repeats a
# catcher row whenever the right-hand table holds several rows for that id
# (the frames are de-duplicated on whole rows, not on jobma_catcher_id), so
# the row count is checked after every merge and the offending table is named.
tables_to_join = {
    'wallet_df': wallet_df,
    'subscription_df': subscription_df,
    'invitation_df': invitation_df,
    'job_posting_df': job_posting_df,
    'kit_df': kit_df,
    'recorded_interview_df': recorded_interview_df,
    'live_interview_df': live_interview_df,
}
for table_name, table in tables_to_join.items():
    rows_before = len(final_df)
    final_df = final_df.merge(table, on='jobma_catcher_id', how='left')
    if len(final_df) != rows_before:
        raise ValueError(
            f"Merging {table_name} changed the row count from {rows_before} to "
            f"{len(final_df)}: it holds several rows for the same jobma_catcher_id."
        )
final_df = final_df.drop_duplicates()

print(f"Final merged df shape is {final_df.shape}")

Final merged df shape is (6116, 17)


In [43]:
# Preview the first rows of the merged dataset.
final_df.head()

Unnamed: 0,jobma_catcher_id,is_premium,jobma_catcher_is_deleted,jobma_verified,subscription_status,company_size,subscription_type,plan_type,is_unlimited,jobma_catcher_sub_accounts,subscription_amount,number_of_subscriptions,number_of_invitations,job_posted,number_of_kits,number_of_recorded_interviews,number_of_live_interviews
0,2656,0,,1,1,1-25,0,0,1,0.0,15000.0,1.0,4.0,1.0,2.0,1.0,
1,2935,0,,1,2,26-100,0,1,0,0.0,53.1,1.0,,,,,
2,2937,0,,1,2,101-500,0,1,0,0.0,118.0,1.0,,,,,
3,2938,0,,1,1,26-100,0,1,0,0.0,15200.18,3.0,,1.0,,,
4,2939,0,,1,2,26-100,0,1,0,0.0,11800.0,1.0,,1.0,,,


In [45]:
# Inspect the distinct sub-account counts; NaN marks rows with no match in subscription_df (left join).
final_df['jobma_catcher_sub_accounts'].unique()

array([  0.,   4.,  nan,   1.,   9.,   2.,   3.,   5.,   6.,  23., 115.,
        24.,   7.,   8.,  11.,  12.,  14.,  10.,  15.])

# Exporting

In [52]:
# Write the merged dataset to disk without the positional index column.
final_df.to_csv(path_or_buf="try_data.csv", index=False)