In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# Step 1: Data Description and Objective

The dataset was released by Aspiring Minds as part of the Aspiring Minds Employment Outcome 2015 (AMEO) study. The study is limited to students from engineering disciplines. The dataset contains the employment outcomes of engineering graduates as dependent variables (Salary, Job Titles, and Job Locations), along with standardized scores from three different areas – cognitive skills, technical skills, and personality skills. The dataset also contains demographic features. It has around 40 independent variables and 4000 data points. The independent variables are both continuous and categorical in nature, and each candidate has a unique identifier. The table below contains the details of the original dataset.



# Step 2: Import the Data and display the Head, Shape, and Description of the Data

In [None]:

# Load the AMEO 2015 dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("aspiring_minds_employability_outcomes_2015.csv")

# display() (built into the IPython/Jupyter namespace) renders DataFrames as
# rich tables instead of the truncated plain text produced by print().
display(df.head())
print(df.shape)
display(df.describe())


In [None]:
# List the column labels of the loaded frame
df.columns

In [None]:
from datetime import date


def calculate_age(dob, today=None):
    """Return the age in completed years for a date of birth.

    Parameters
    ----------
    dob : str
        Date of birth formatted as ``'%d-%m-%Y %H:%M'``
        (for example ``'19-02-1990 00:00'``).
    today : datetime.date, optional
        Reference date the age is measured against. Defaults to
        ``date.today()``; pass a fixed date to make results reproducible.

    Returns
    -------
    int or float
        Number of completed years between ``dob`` and ``today``, or NaN when
        ``dob`` is missing (``None`` / NaN / NaT).

    Raises
    ------
    ValueError
        If ``dob`` is present but does not match the expected format.
    """
    # Missing birth dates cannot be parsed; report them as NaN instead of crashing.
    if pd.isna(dob):
        return float('nan')

    dob_date = pd.to_datetime(dob, format='%d-%m-%Y %H:%M').date()
    if today is None:
        today = date.today()

    # The tuple comparison is True (counted as 1) when this year's birthday
    # has not happened yet, so one year is subtracted in that case.
    age = today.year - dob_date.year - ((today.month, today.day) < (dob_date.month, dob_date.day))
    return age

In [None]:
# Derive an 'age' column by applying calculate_age to every 'DOB' value
df['age'] = df['DOB'].apply(calculate_age)

# Preview only the source and derived columns rather than printing the whole frame
df[['DOB', 'age']].head()


# Step 3: Univariate Analysis
'''
Perform univariate analysis on numerical columns using PDF, Histograms,
Boxplots, etc. to understand the probability and frequency distribution
of each numerical column. Find the outliers and mention observations.
Similarly, perform univariate analysis on categorical variables/columns
to understand the frequency distribution of each categorical column and
mention observations.
'''


In [None]:
# PDF and Histogram
# sns.distplot is deprecated; histplot with a KDE overlay on a density scale
# is its documented replacement.
num_cols = df.select_dtypes(include=np.number).columns
for col in num_cols:
    sns.histplot(df[col], kde=True, stat="density")
    plt.title(f"Distribution of {col}")
    plt.show()

# Boxplot
# Pass the series explicitly as x so the orientation does not depend on how a
# given seaborn version interprets a positional argument.
for col in num_cols:
    sns.boxplot(x=df[col])
    plt.title(f"Boxplot of {col}")
    plt.show()





In [None]:
# # Countplot
# df=df.drop(df['unnamed'])
# cat_cols = df.select_dtypes(include='object').columns
# for col in cat_cols:
#     sns.countplot(df[col])
#     plt.show()

# Step 4: Bivariate Analysis
'''
Discover the relationships between numerical columns using scatter plots,
hexbin plots, pair plots, etc. Identify patterns between categorical and
numerical columns using swarmplot, boxplot, barplot, etc. Mention observations
after each plot.
'''

In [None]:


# Scatter plot: Salary (y-axis) against age (x-axis)
sns.scatterplot(data=df, y='Salary', x='age')
plt.show()




In [None]:
# Strip plot of Specialization by Gender
sns.set(style="whitegrid")

# Create the figure BEFORE plotting: calling plt.figure(figsize=...) after the
# plot opens a second, empty figure and leaves the real plot at default size.
fig, ax = plt.subplots(figsize=(8, 6))
sns.stripplot(x='Gender', y='Specialization', data=df, ax=ax)
ax.set_xlabel('Gender', fontsize=14)
ax.set_ylabel('Specialization', fontsize=5)  # set the font size
plt.show()

In [None]:
# Show the column labels of the frame again
df.columns

In [None]:
# Preview the first two rows of the frame
df.head(2)

# Step 5: Research Questions

In [None]:
# Question 1: Testing the claim made by "Times of India" article dated Jan 18, 2019.
# Test if taking up jobs as a Programming Analyst, Software Engineer, Hardware Engineer, and Associate Engineer can earn up to 2.5-3 lakhs as a fresh graduate.

# Designations named in the claim, lower-cased for case-insensitive matching.
# 'programmer analyst' is included as a spelling variant -- TODO confirm which
# spelling the dataset actually uses.
claimed_roles = [
    'programming analyst',
    'programmer analyst',
    'software engineer',
    'hardware engineer',
    'associate engineer',
]
# Claimed salary band: 2.5 to 3 lakhs
claim_low, claim_high = 250000, 300000

# Select by Designation (the roles the claim is about), not by city or specialization.
role_mask = df['Designation'].astype(str).str.strip().str.lower().isin(claimed_roles)
salary_range = df.loc[role_mask, 'Salary'].dropna()

if salary_range.empty:
    # Guard against the vacuous case: .all() on an empty selection is True and
    # would report the claim as verified without any supporting data.
    print("No records found for the designations named in the claim; it cannot be tested.")
else:
    # The claim is about typical earnings, so compare the average salary with
    # the claimed band instead of requiring every single salary to fall in it.
    mean_salary = salary_range.mean()
    print(f"Records: {len(salary_range)}, mean salary: {mean_salary:.0f}, median salary: {salary_range.median():.0f}")
    if claim_low <= mean_salary <= claim_high:
        print("The claim made by 'Times of India' article dated Jan 18, 2019 has been verified.")
    else:
        print("The claim made by 'Times of India' article dated Jan 18, 2019 is not valid.")

In [None]:
# Question 2: Is there a relationship between gender and specialization?
# Determine if the preference of specialization depends on the gender.

from scipy.stats import chi2_contingency

# Significance level used for the independence test
ALPHA = 0.05

# Counts of graduates for every (Specialization, Gender) combination
gender_by_specialization = pd.crosstab(df['Specialization'], df['Gender'])

# Chi-square test of independence between the two categorical variables.
# NOTE: rare specializations give small expected counts, which makes the
# chi-square approximation rough -- interpret the p-value with care.
chi2, p_value, dof, _ = chi2_contingency(gender_by_specialization)
print(f"Chi-square = {chi2:.2f}, dof = {dof}, p-value = {p_value:.4g}")
if p_value < ALPHA:
    print("Specialization and gender are not independent at the 5% significance level.")
else:
    print("No evidence at the 5% significance level that specialization depends on gender.")

# Salary distribution per gender, split by specialization
sns.catplot(x="Gender", y="Salary", hue="Specialization", data=df, kind="box")
plt.show()


In [None]:
# Strip plot of Degree by Gender, coloured by Specialization.
# catplot is a figure-level function: it creates its own figure, so the size is
# set through height/aspect (8x6 here) rather than a separate plt.figure call,
# which would only open an extra empty figure.
g = sns.catplot(x="Gender", y="Degree", hue="Specialization", data=df, kind="strip",
                height=6, aspect=8 / 6)

# Label the axes of THIS plot (g.ax) instead of an `ax` left over from an earlier cell.
g.ax.set_xlabel('Gender', fontsize=14)
g.ax.set_ylabel('Degree', fontsize=5)  # set the font size
plt.show()

In [None]:
# Step 7: Feature Transformation

# Numerical features: standardize every numeric column with StandardScaler
from sklearn.preprocessing import StandardScaler

num_cols = df.select_dtypes(include=np.number).columns
# Fit the scaler on the numeric columns, then write the scaled values back
scaler = StandardScaler().fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])


In [None]:

# For Categorical Features: Dummy Variables
# Only the last expression of a cell is displayed, so the intermediate shapes
# are printed explicitly to make each step's effect visible.
print("Shape before encoding:", df.shape)

# One-hot encode the listed categorical columns
df = pd.get_dummies(df, columns = ['Designation', 'JobCity', '10board', '12board', 'Degree', 'Specialization', 'CollegeState'])
print("Shape after dummy encoding:", df.shape)

# Encode gender as a binary flag; values other than 'm'/'f' become NaN
df['Gender'] = df['Gender'].map({'m' : 1, 'f' : 0})

# Drop the index-like and raw date columns
df = df.drop(['Unnamed: 0','DOJ','DOL','DOB'], axis = 1)
print("Shape after dropping columns:", df.shape)

df.head(5)