In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

In [2]:
warnings.filterwarnings('ignore')

In [3]:
# Read the SQLAlchemy connection URL from the environment so database
# credentials are never stored in (or committed with) the notebook.
# Expected form:
#   mysql+pymysql://<user>:<url-encoded password>@<host>:<port>/<database>
# NOTE(review): the password that used to be embedded here should be rotated.
try:
    database_url = os.environ["JOBMA_DATABASE_URL"]
except KeyError:
    raise RuntimeError(
        "Set the JOBMA_DATABASE_URL environment variable to the SQLAlchemy "
        "connection URL before running this notebook."
    ) from None
engine = create_engine(database_url)

In [4]:
'''Filling Missing Values'''

def fill_missing_values(df):
    """Return a copy of ``df`` with missing values filled column by column.

    The fill value is chosen from each column's dtype:

    * integer (``dtype.kind == 'i'``) -> ``0``
    * float (``dtype.kind == 'f'``)   -> ``0.0``
    * ``object``                      -> ``'Unknown'``
    * anything else                   -> the column's most frequent value

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``.

    Notes
    -----
    In the fallback case a column without any non-missing value has no mode;
    such a column is left unchanged instead of raising on the empty mode.
    """
    filled = df.copy()
    for col in filled.columns:
        column = filled[col]
        kind = column.dtype.kind
        if kind == 'i':  # integer (including nullable Int64)
            filled[col] = column.fillna(0)
        elif kind == 'f':  # float
            filled[col] = column.fillna(0.0)
        elif column.dtype == object:
            filled[col] = column.fillna('Unknown')
        elif column.isna().any():
            # Fallback: most frequent value. mode() is empty when every value
            # is missing, so only index into it when there is something there.
            modes = column.mode()
            if not modes.empty:
                filled[col] = column.fillna(modes.iloc[0])
    return filled

In [5]:
# Load each source table in full; the cells below trim the frames to the
# columns they need.
(
    catcher_df,
    wallet_df,
    subscription_df,
    invitation_df,
    job_posting_df,
    kit_df,
) = (
    pd.read_sql(f'Select * FROM {table_name}', con=engine)
    for table_name in (
        'jobma_catcher',
        'wallet',
        'subscription_history',
        'jobma_pitcher_invitations',
        'jobma_employer_job_posting',
        'job_assessment_kit',
    )
)

# Catcher DataFrame

In [7]:
catcher_df.shape[1]

75

In [8]:
catcher_df.columns

Index(['jobma_catcher_id', 'jobma_catcher_fname', 'jobma_catcher_lname',
       'jobma_catcher_dob', 'jobma_catcher_video_status',
       'jobma_catcher_company', 'org_type', 'sec_ques_id', 'security_ans',
       'jobma_about_company', 'jobma_functional', 'jobma_catcher_indus',
       'jobma_company_email', 'jobma_catcher_email', 'jobma_catcher_title',
       'jobma_catcher_ext', 'jobma_catcher_otype', 'jobma_catcher_photo',
       'jobma_catcher_logo', 'jobma_catcher_website', 'jobma_company_address',
       'jobma_catcher_address2', 'jobma_catcher_video', 'jobma_catcher_city',
       'jobma_catcher_state', 'jobma_catcher_country', 'jobma_catcher_zip',
       'jobma_catcher_fax', 'jobma_catcher_phone', 'jobma_catcher_status',
       'jobma_catcher_creation', 'jobma_catcher_type',
       'jobma_catcher_sub_accounts', 'is_premium', 'jobma_catcher_parent',
       'jobma_catcher_is_deleted', 'jobma_verified', 'jobma_in_index',
       'data_access', 'subscription_status', 'interview_rate',

In [9]:
catcher_df['jobma_catcher_sub_accounts'].unique()

array(['0', '', '49', '7'], dtype=object)

In [10]:
# Spell out the abbreviated industry column name.
catcher_df = catcher_df.rename(
    columns={'jobma_catcher_indus': 'jobma_catcher_industry'}
)

In [11]:
# Keep the subset of catcher columns that is carried into the final dataset.
# .copy() detaches the selection from the full 75-column frame, so the column
# assignments in the following cells write to an independent DataFrame rather
# than to a slice (which pandas flags with SettingWithCopyWarning).
catcher_df = catcher_df[[
    'jobma_catcher_id', 'org_type', 'jobma_catcher_industry',
    'jobma_catcher_type', 'is_premium', 'jobma_catcher_sub_accounts',
    'jobma_catcher_is_deleted', 'jobma_verified',
    'subscription_status', 'interview_rate', 'live_interview_credit',
    'pre_recorded_credit', 'ai_live_interview_credit', 'credit_value',
    'interview_cost_type', 'subscription_type',
    'jobma_support_rtc', 'interview_question', 'video_recording_suppport',
    'sing_up_canditate_after_apply', 'currency', 'referral_credit',
    'company_size',
]].copy()

In [12]:
catcher_df.shape

(7672, 23)

In [13]:
catcher_df['jobma_catcher_sub_accounts'].unique()

array(['0', '', '49', '7'], dtype=object)

In [14]:
# The raw column holds strings (including ''); anything non-numeric is coerced
# to a missing value and the result is stored as nullable Int64.
sub_accounts = pd.to_numeric(catcher_df['jobma_catcher_sub_accounts'], errors='coerce')
catcher_df['jobma_catcher_sub_accounts'] = sub_accounts.astype('Int64')

In [15]:
catcher_df['jobma_catcher_sub_accounts'].unique()

<IntegerArray>
[0, <NA>, 49, 7]
Length: 4, dtype: Int64

In [16]:
# Recode jobma_catcher_is_deleted: '0' -> 'No', '1' -> 'Yes', missing -> 'No'.
deleted_labels = {'0': 'No', '1': 'Yes', np.nan: 'No'}
catcher_df['jobma_catcher_is_deleted'] = (
    catcher_df['jobma_catcher_is_deleted'].replace(deleted_labels)
)

In [17]:
# Fill missing company_size with the column's most frequent value and missing
# jobma_catcher_type with '0'.
#
# The filled Series is assigned back to the frame. Calling
# fillna(inplace=True) on catcher_df[...] is chained assignment: it acts on a
# temporary Series, and with pandas Copy-on-Write it leaves catcher_df
# unchanged.
mode_value = catcher_df['company_size'].mode()[0]
catcher_df['company_size'] = catcher_df['company_size'].fillna(mode_value)
catcher_df['jobma_catcher_type'] = catcher_df['jobma_catcher_type'].fillna('0')

In [18]:
''' Filling Missing Values '''
catcher_df = fill_missing_values(catcher_df)

In [19]:
catcher_df['jobma_catcher_sub_accounts'].unique()

<IntegerArray>
[0, 49, 7]
Length: 3, dtype: Int64

# Wallet DataFrame

In [21]:
wallet_df.columns

Index(['id', 'catcher_id', 'credit_amount', 'wallet_amount',
       'subscription_type', 'plan_type', 'is_unlimited', 'premium_storage',
       'trial_account_type', 'gst_code', 'billing_cycle',
       'billing_cycle_status', 'created_at', 'updated_at'],
      dtype='object')

In [22]:
# Keep the catcher key plus the wallet attributes carried into the final
# dataset. .copy() makes the selection an independent DataFrame so the column
# assignments in the next cell do not write into a slice of the full table.
wallet_df = wallet_df[[
    'catcher_id', 'credit_amount', 'wallet_amount',
    'subscription_type', 'plan_type', 'is_unlimited', 'premium_storage',
]].copy()

In [23]:
# Columns whose '0' / '1' codes are relabelled as 'No' / 'Yes'.
yes_no_columns = ['subscription_type', 'plan_type', 'is_unlimited', 'premium_storage']

# Recode all of them in one DataFrame-level replace.
wallet_df[yes_no_columns] = wallet_df[yes_no_columns].replace({'0': 'No', '1': 'Yes'})

In [24]:
wallet_df = wallet_df.rename(columns={'catcher_id':'jobma_catcher_id'})

In [25]:
wallet_df.isnull().sum()

jobma_catcher_id        0
credit_amount        5397
wallet_amount           0
subscription_type       0
plan_type               0
is_unlimited            0
premium_storage         0
dtype: int64

In [26]:
''' Filling Missing Values '''
wallet_df = fill_missing_values(wallet_df)

In [27]:
wallet_df.duplicated().sum()

1

In [28]:
# Keep a single wallet row per catcher (the first occurrence of each id).
wallet_df = wallet_df.drop_duplicates(subset='jobma_catcher_id')

In [29]:
wallet_df.duplicated().sum()

0

# Kit DataFrame
**Count the number of kits created by each catcher**

In [31]:
kit_df = kit_df.rename(columns={'catcher_id':'jobma_catcher_id'})

In [32]:
# Attach to every kit row the total number of kit rows for its catcher.
kits_per_catcher = kit_df['jobma_catcher_id'].value_counts()
kit_df['number_of_kits'] = kit_df['jobma_catcher_id'].map(kits_per_catcher)

In [33]:
kit_df = kit_df[['jobma_catcher_id', 'number_of_kits']]

In [34]:
kit_df.shape

(5852, 2)

In [35]:
# Collapse the repeated (catcher, count) pairs to one row each.
kit_df = kit_df.drop_duplicates()

In [36]:
kit_df.head()

Unnamed: 0,jobma_catcher_id,number_of_kits
0,908,44
1,1209,69
5,1235,9
6,1040,4
10,1268,2


In [37]:
kit_df.shape

(1082, 2)

In [38]:
kit_df['number_of_kits'].unique()

array([ 44,  69,   9,   4,   2,  24,   1,  12,   8,  23, 145,  48,  11,
        57,  59,   3,   5,   6,  61,  35,  17,  36, 106,   7,  85,  37,
       151,  10,  29,  34, 131,  13,  16,  50, 172,  15,  27,  71,  31,
       124,  80,  41,  75, 196,  18,  20,  67,  96,  82,  19,  97,  55,
        42,  90,  25,  14,  46,  84, 297,  30], dtype=int64)

In [39]:
kit_df[kit_df['number_of_kits'] == 297]

Unnamed: 0,jobma_catcher_id,number_of_kits
5200,1873,297


# Invitation DataFrame
**Count the number of invitations sent by each catcher**

In [41]:
# Attach to every invitation row the total number of invitation rows for its catcher.
invitations_per_catcher = invitation_df['jobma_catcher_id'].value_counts()
invitation_df['number_of_invitations'] = invitation_df['jobma_catcher_id'].map(invitations_per_catcher)

In [42]:
invitation_df = invitation_df[['jobma_catcher_id', 'number_of_invitations']]

In [43]:
invitation_df.shape

(54691, 2)

In [44]:
# Collapse the repeated (catcher, count) pairs to one row each.
invitation_df = invitation_df.drop_duplicates()

In [45]:
invitation_df.shape

(1018, 2)

# Job Posting DataFrame
**Count the number of jobs posted by each catcher**

In [47]:
job_posting_df.columns

Index(['jobma_job_post_id', 'taleo_job_id', 'jobma_catcher_id',
       'job_assessment_kit_id', 'ai_live_kit_id', 'jobma_job_title', 'slug',
       'jobma_job_description', 'jobma_job_notes', 'jobma_job_type',
       'job_dur_year', 'job_dur_month', 'jobma_job_min_exp',
       'jobma_job_max_exp', 'jobma_job_min_salary', 'jobma_job_max_salary',
       'jobma_job_sal_negotiable', 'jobma_job_sal_comments',
       'jobma_job_qualification', 'jobma_job_locations', 'jobma_job_industry',
       'jobma_job_functional_areas', 'jobma_job_keywords',
       'jobma_job_currency', 'jobma_job_salary_type',
       'jobma_job_recruiter_name', 'jobma_job_recruiter_email',
       'jobma_job_recruiter_phone', 'jobma_job_recruiter_ext',
       'jobma_job_recruiter_website', 'jobma_job_company_name',
       'jobma_job_company_profile', 'jobma_job_company_website',
       'jobma_job_company_url_redirect', 'jobma_job_notice_period',
       'jobma_job_expiry_date', 'jobma_job_apply_count',
       'jobma_job_v

In [48]:
# Attach to every posting row the total number of posting rows for its catcher.
postings_per_catcher = job_posting_df['jobma_catcher_id'].value_counts()
job_posting_df['job_posted'] = job_posting_df['jobma_catcher_id'].map(postings_per_catcher)

In [49]:
job_posting_df = job_posting_df[['jobma_catcher_id', 'job_posted']]

In [50]:
job_posting_df.shape

(16742, 2)

In [51]:
# Collapse the repeated (catcher, count) pairs to one row each.
job_posting_df = job_posting_df.drop_duplicates()

In [52]:
job_posting_df.shape

(1292, 2)

# Subscription DataFrame

In [54]:
# Use the same key column name as the other frames.
subscription_df = subscription_df.rename(columns={'catcher_id': 'jobma_catcher_id'})

In [55]:
subscription_df.columns

Index(['id', 'jobma_catcher_id', 'catcher_username', 'catcher_email',
       'sub_user_id', 'subscription_id', 'premium_plan_id', 'premium_plan',
       'transaction_id', 'subscription_amount', 'credit_given',
       'paypal_profile_id', 'paypal_payer_id', 'expiry_date', 'currency',
       'premium_storage', 'igst', 'cgst', 'gst_code', 'currency_val',
       'recurring_data', 'payment_mode', 'status', 'payment_status',
       'e_invoice', 'cheque_number', 'banker', 'cheque_amount', 'cheque_image',
       'cancel_date', 'invoice_suffix', 'radioGstValue', 'created_at',
       'updated_at'],
      dtype='object')

In [56]:
# One row per catcher: the summed subscription_amount and the number of
# subscription rows that have a non-null amount ('count' skips missing values).
amounts_by_catcher = subscription_df.groupby('jobma_catcher_id')['subscription_amount']
subscription_df = amounts_by_catcher.agg(
    subscription_amount='sum',
    number_of_subscriptions='count',
).reset_index()

In [57]:
subscription_df.columns

Index(['jobma_catcher_id', 'subscription_amount', 'number_of_subscriptions'], dtype='object')

In [58]:
subscription_df.duplicated().sum()

0

In [59]:
subscription_df[subscription_df['jobma_catcher_id'] == 92]

Unnamed: 0,jobma_catcher_id,subscription_amount,number_of_subscriptions
0,92,199.0,2


# Merging DataFrames

In [61]:
# Report the shape of every frame that feeds the merge.
for label, frame in (
    ('Catcher', catcher_df),
    ('Wallet', wallet_df),
    ('Subscription', subscription_df),
    ('Invitation', invitation_df),
    ('Job Posting', job_posting_df),
    ('Number of Kit', kit_df),
):
    print(f'{label} df shape is {frame.shape}')

Catcher df shape is (7672, 23)
Wallet df shape is (5560, 7)
Subscription df shape is (5560, 3)
Invitation df shape is (1018, 2)
Job Posting df shape is (1292, 2)
Number of Kit df shape is (1082, 2)


In [62]:
# Start from the catcher table and left-join each per-catcher table onto it,
# so every catcher row is kept even when a table has no matching row.
final_df = catcher_df.copy()

# validate='many_to_one' makes pandas raise MergeError when a right-hand frame
# holds more than one row for a jobma_catcher_id, instead of silently
# duplicating catcher rows in the result.
# NOTE(review): catcher_df and wallet_df both contain 'subscription_type', so
# the merged frame carries 'subscription_type_x' / 'subscription_type_y' —
# confirm that naming is what consumers of the CSV expect.
for right_df in (wallet_df, subscription_df, invitation_df, job_posting_df, kit_df):
    final_df = final_df.merge(
        right_df, on='jobma_catcher_id', how='left', validate='many_to_one'
    )

print(f"Final merged df shape is {final_df.shape}")

Final merged df shape is (7672, 34)


In [63]:
# Write the merged dataset to CSV; index=False leaves the DataFrame index out of the file.
final_df.to_csv("extracted_data1.csv", index=False)