#### <u> <span style="color:#42abc6"> Importing Necessary Libraries

In [None]:

# Numerical, data-frame and plotting libraries used throughout the notebook.
import numpy as np
import pandas as pd
import  matplotlib.pyplot as plt
import seaborn as sns
# Allow up to 4000 rows to be printed before pandas truncates a displayed frame.
pd.options.display.max_rows = 4000
import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas' chained-assignment warnings and library deprecation notices.
warnings.filterwarnings("ignore")
# Show floats with three decimal places in displayed frames.
pd.set_option('display.float_format',lambda x:'%.3f'%x)
# Apply seaborn's default plot theme.
sns.set()

#### <u> <span style="color:#f3bac5"> Load the Dataset

In [None]:
# Location of the employability-outcomes CSV inside the Kaggle input directory.
DATA_PATH = "../input/aspiring-minds-employability-outcomes-2015/aspiring_minds_employability_outcomes_2015.csv"

# Load the whole file into the notebook-level frame and preview the first rows.
df = pd.read_csv(DATA_PATH)
df.head()

#### Columns In DataFrame

In [None]:
# Show every column label of the loaded frame.
df.columns

#### <u> <span style="color:#acbf52"> Statistical Description of DataFrame

In [None]:
# Summary statistics produced by DataFrame.describe() for the freshly loaded frame.
df.describe()

#### <u> <span style="color:#1432ff"> Looking for Missing Values 

In [None]:
# Count the missing entries in each column.
df.isna().sum()

#### <u> Dropping the Unnecessary Columns

In [None]:
# Columns the notebook treats as unnecessary for the analysis.
unused_columns = ['Unnamed: 0','ID','10board','12board','CollegeID','CollegeCityID']

# Rebind instead of mutating in place, and ignore labels that are already gone,
# so re-running this cell does not raise KeyError on the second execution.
df = df.drop(columns=unused_columns, errors='ignore')
df.head()

### <u>Getting the Year from the Date of Join for yearwise joining data

In [None]:
# Parse DOJ as datetimes and store only the calendar year in a new 'dyear' column.
df['dyear']=pd.DatetimeIndex(df['DOJ']).year


#### <u> Subjectwise Box-plot for AMCAT Test to find the outliers

In [None]:
# One box plot per column at positions 17..32, filling an 8x2 grid row by row
# (left panel first, then right panel, then the next row).
f, axes = plt.subplots(8, 2, figsize=(18, 80))
for col_pos, panel in zip(range(17, 33), axes.flat):
    sns.boxplot(x=df.iloc[:, col_pos], orient='v', ax=panel)


    

#### <u>Outliers for English Subject

In [None]:
# English scores below 220 or above 790; reset_index() keeps the row labels as a column.
english = df['English']
english[english.lt(220) | english.gt(790)].reset_index()

#### <u>Outliers for Logical Subject

In [None]:
# Logical scores below 240 or above 790; reset_index() keeps the row labels as a column.
logical = df['Logical']
logical[logical.lt(240) | logical.gt(790)].reset_index()

#### <u>Outliers for Quant Subject

In [None]:
# Frequency of each Quant score that is below 210 or above 840.
quant = df['Quant']
quant[quant.lt(210) | quant.gt(840)].value_counts().reset_index()

#### <u>Outliers for Domain Subject

In [None]:
# Frequency of each Domain score that is -1 or lower.
df.loc[df['Domain'].le(-1), 'Domain'].value_counts().reset_index()

#### <u>Outliers for ComputerProgramming Subject

In [None]:
# ComputerProgramming scores above 800, with their row labels.
df.loc[df['ComputerProgramming'].gt(800), 'ComputerProgramming'].reset_index()

#### <u>Outliers for ElectronicsAndSemicon Subject

In [None]:
# ElectronicsAndSemicon scores above 600, with their row labels.
df.loc[df['ElectronicsAndSemicon'].gt(600), 'ElectronicsAndSemicon'].reset_index()

#### Frequency Distribution of ComputerScience

In [None]:
# Number of rows sharing each distinct ComputerScience score.
df['ComputerScience'].value_counts()

#### Frequency Distribution of MechanicalEngg

In [None]:
# Frequency of each MechanicalEngg score above 160.
df.loc[df['MechanicalEngg'].gt(160), 'MechanicalEngg'].value_counts().reset_index()

#### Frequency Distribution of ElectricalEngg

In [None]:
# Frequency of each ElectricalEngg score above 200.
df.loc[df['ElectricalEngg'].gt(200), 'ElectricalEngg'].value_counts().reset_index()

#### Frequency Distribution of TelecomEngg

In [None]:
# Frequency of each TelecomEngg score above 150.
df.loc[df['TelecomEngg'].gt(150), 'TelecomEngg'].value_counts().reset_index()

#### Frequency Distribution of CivilEngg

In [None]:
# Number of rows sharing each distinct CivilEngg score.
df['CivilEngg'].value_counts()

### From the Above Boxplot we can clearly see that there are some outliers in the columns

#### <u> Distribution plot for Academic Marks (10,12,grad,Sum of Amcat marks)

In [None]:
# Sum only the columns at positions 17..32 -- the same positional range the
# box-plot grid earlier in the notebook iterates over. An open-ended `17:` slice
# would also add every column appended afterwards (such as 'dyear', or
# 'Total_amcat' itself when this cell is re-run) into the total.
df['Total_amcat'] = df.iloc[:, 17:33].sum(axis=1)

# One distribution plot per academic measure on a 2x2 grid, titled by column name.
f, axes = plt.subplots(2, 2, figsize=(18, 20))
academic_columns = ['10percentage', '12percentage', 'collegeGPA', 'Total_amcat']
for panel, column in zip(axes.flat, academic_columns):
    sns.distplot(x=df[column], ax=panel)
    panel.set_title(column)



#### <u> Yearwise Joining of Employees Group by Gender

In [None]:
# Count of rows per joining year, split into one bar per Gender value.
joining_grid = sns.catplot(data=df, x="dyear", hue="Gender", kind='count', palette='magma')
# Turn the year tick labels sideways.
plt.xticks(rotation=90)

#### <u> Custom Function for Removing the Timestamps from the Dates

In [None]:
def stamp(col, frame=None):
    """Remove the '0:00' time-of-day text from the date strings of one column, in place.

    Every string entry other than the literal 'present' has each '0:00'
    occurrence removed and surrounding whitespace trimmed. Entries that are not
    strings (for example missing values) are left untouched.

    Parameters
    ----------
    col : str
        Name of the column to clean.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The column is updated inside ``frame``.
    """
    if frame is None:
        frame = df

    values = frame[col]
    is_text = values.map(lambda value: isinstance(value, str))
    needs_cleaning = is_text & values.ne('present')
    if not needs_cleaning.any():
        return

    # A single .loc write replaces the per-row chained assignment
    # (frame[col][i] = ...), which looked rows up by label and so only worked
    # with a default 0..n-1 index, and which is not a reliable way to write
    # into a DataFrame.
    frame.loc[needs_cleaning, col] = (
        values[needs_cleaning].str.replace('0:00', '', regex=False).str.strip()
    )

        

#### <u><span style="color:#52ab65"> Removing TimeStamps from Dates of DOJ,DOL,DOB

In [None]:
# Clean the joining, leaving and birth date columns, in that order.
for date_column in ("DOJ", "DOL", "DOB"):
    stamp(date_column)


#### <u> Detecting the Outliers in Salary by Applying the Z-Score Method (Threshold = 3)

In [None]:
from scipy import stats
import numpy as np

# Absolute z-score of every salary.
z = np.abs(stats.zscore(df.Salary))

# Salaries more than `threshold` standard deviations from the mean count as outliers.
threshold = 3

# np.where returns a tuple of position arrays; it is kept as a list holding one
# array so `p` has the same shape as before.
p = list(np.where(z > threshold))

# np.where yields positions, not index labels, so look the salaries up with .iloc.
# Nothing is removed from df here; `outlier` only collects the flagged salaries.
outlier = [df['Salary'].iloc[i] for i in p]

#### <u>Distribution plot for Yearwise Joining

In [None]:
# Distribution plot of the joining-year column.
sns.distplot(df['dyear'])

In [None]:
# Box plot of collegeGPA to expose values far from the bulk of the data.
sns.boxplot(x=df['collegeGPA'],orient='v')


#### <span style="color:red"> From the plot above we can clearly see that there are outliers, which lie below 20

#### Outliers for the CollegeGPA

In [None]:
# collegeGPA values below 20 as a one-column frame, renumbered from 0.
df.loc[df['collegeGPA'].lt(20), ['collegeGPA']].reset_index(drop=True)

### Univariate Analysis of 10 $^{th}$ and 12$^{th}$ Marks

In [None]:
# Box plot of the 10percentage column.
sns.boxplot(x=df['10percentage'],orient='v')


#### <u> Outliers for the 10percentage column

In [None]:
# Frequency of each 10percentage value below 52.
tenth = df['10percentage']
tenth[tenth.lt(52)].value_counts().reset_index()

In [None]:
# Box plot of the 12percentage column.
sns.boxplot(x=df['12percentage'],orient='v')


In [None]:
# 12percentage values below 41, with their row labels.
twelfth = df['12percentage']
twelfth[twelfth.lt(41)].reset_index()

#### <u>Average Salary By Designation

In [None]:
# Mean Salary for each distinct Designation.
df['Salary'].groupby(df['Designation']).mean()


### Correlation between our DataFrame columns lies between -1 and +1, where +1 means strongly positively correlated and -1 means strongly negatively correlated

In [None]:
# Correlate only the numeric/boolean columns. The frame also holds text columns,
# and on pandas >= 2.0 DataFrame.corr() raises on them instead of silently
# dropping them, so the selection is made explicit.
df.select_dtypes(include=['number', 'bool']).corr()

#### Filtering the correlations between columns using a threshold of 0.5

In [None]:
# Absolute pairwise correlations of the numeric/boolean columns only
# (DataFrame.corr() raises on text columns with pandas >= 2.0).
c = df.select_dtypes(include=['number', 'bool']).corr().abs()

# Flatten the matrix into a Series indexed by (column, column) pairs and sort it.
s = c.unstack()
so = s.sort_values(kind="quicksort")

# Keep pairs stronger than 0.5; `< 1` drops each column's correlation with itself.
# The matrix is symmetric, so every pair is listed twice (A-B and B-A).
so[(so>0.5) & (so<1)]

### <u> Co-relation HeatMap

In [None]:
# Heat map of the correlations between numeric/boolean columns only
# (DataFrame.corr() raises on text columns with pandas >= 2.0).
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr())

### Scatter and Joint-plot of Highly Co-related Features

In [None]:
# 10th vs 12th percentage, coloured by Gender.
sns.scatterplot(data=df, x="10percentage", y="12percentage", hue="Gender")

# Hex-bin joint plot of Salary against each of the three selected features.
for feature in ("collegeGPA", "Total_amcat", "GraduationYear"):
    sns.jointplot(data=df, x=feature, y="Salary", kind='hex')

### Swarmplot Commented because consuming much Time

In [None]:
# sns.swarmplot(x="Total_amcat",
#                   y="Salary",
#                   data=df)

### One-hot Encoding of the Gender column, because machines work better with numbers than with text

In [None]:
# drop_first=True leaves one indicator column when Gender has two categories.
# squeeze(axis=1) turns that one-column frame into a Series so a Series (not a
# DataFrame) is assigned to 'male'; with any other column count the frame is
# passed through unchanged and the assignment fails loudly, as it did before.
# astype(int) gives 0/1 integers on every pandas version (newer releases emit booleans).
df['male'] = pd.get_dummies(df['Gender'], drop_first=True).squeeze(axis=1).astype(int)

### Fresh Statistical Description of the DataFrame

In [None]:
# Summary statistics again, now that derived columns have been added to the frame.
df.describe()

### <u> Fetching the Years from the DOJ=Date of joining for better visualization. And saving it to the 'dyear' column in the DataFrame

In [None]:
# Re-derive the joining year from DOJ; this repeats the statement used earlier in
# the notebook, now running after the timestamp text has been stripped from DOJ.
df['dyear']=pd.DatetimeIndex(df['DOJ']).year

###  <u><span style="color:#abcf43">Min-Max Normalization of Salary Column

In [None]:
from sklearn import preprocessing

# The scaler expects 2-D input, hence the double brackets: `x` is a one-column
# DataFrame, not a numpy array.
x = df[['Salary']]
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)

# Give the scaled values df's own index before assigning. A bare
# pd.DataFrame(x_scaled) gets a fresh 0..n-1 index, and the assignment would then
# align on labels and produce NaN wherever df's index is not 0..n-1.
df_normalized = pd.DataFrame(x_scaled, index=df.index)
df['n_sal'] = df_normalized[0]

#### Yearwise Salary of Employee During Joining Year

In [None]:
# Bar plot of n_sal for each joining year.
sns.barplot(x='dyear',y='n_sal',data=df)

### <u>Year-wise salary group by Gender

In [None]:
# Same bar plot, with one bar per value of the 'male' indicator within each year.
salary_axes = sns.barplot(data=df, x='dyear', y='n_sal', hue='male', palette='magma')
salary_axes.set_xlabel("Joining Year")
salary_axes.set_ylabel("Normalized Salary")
plt.show()

#### From the above plot we can see that in 1991 and 2004 there were no male employees in the company :{)

### <u><span style="color:#aaff22"> Year-wise New joining : - 

In [None]:
# Number of rows per joining year.
df['dyear'].value_counts()

### <u><span style="color:#fab222"> Fetching Year from DOL=Date of Leaving Column :

In [None]:
# Derive the leaving year from DOL as a true integer column:
#   * 'present' (no leaving date) -> 0
#   * otherwise the first four-digit number found in the text
#   * NOTE(review): text with no four-digit number also becomes 0 -- confirm that
#     is acceptable for this dataset.
# This replaces a row-by-row loop that called `re` without importing it, wrote
# through chained assignment (df['leaving_yr'][i] = ...), and stored the years as
# strings next to the integer 0 placeholder.
dol_text = df['DOL'].astype(str)
year_text = dol_text.str.extract(r'(\d{4})', expand=False)
df['leaving_yr'] = (
    pd.to_numeric(year_text.where(dol_text != 'present'), errors='coerce')
    .fillna(0)
    .astype(int)
)

In [None]:
# Number of rows per leaving_yr value (this includes the 0 placeholder used for 'present').
df['leaving_yr'].value_counts()

#### Joining vs Leaving Employees per Year

In [None]:
# Joiners minus leavers per calendar year.
joined_per_year = df['dyear'].value_counts()

# Coerce to numbers so years stored as text still line up with the integer
# joining years, and drop the 0 placeholder that marks 'present'.
leaving_year = pd.to_numeric(df['leaving_yr'], errors='coerce')
left_per_year = leaving_year[leaving_year.ne(0)].value_counts()

# fill_value=0 keeps years that appear on only one side; a plain subtraction
# would turn them into NaN.
net_per_year = joined_per_year.sub(left_per_year, fill_value=0).astype(int).sort_index()
print(net_per_year)

#### From the above stats we can see that 2015 has negative employee retention. In the other years there were more new joinees than employees leaving the company

#### Yearwise Employees Resignation

In [None]:
# Resignations per year, excluding people still employed. The placeholder for
# 'present' is the integer 0, so comparing against the string '0' filtered
# nothing; compare numerically instead.
leaving_year = pd.to_numeric(df['leaving_yr'], errors='coerce')
k = df.loc[leaving_year.ne(0), 'leaving_yr'].value_counts()
k

### Plot of Date of Leaving Employees

In [None]:
# Bar chart of resignations per year. The 0 placeholder for 'present' is removed
# with a numeric comparison; comparing against the string '0' filtered nothing.
leaving_year = pd.to_numeric(df['leaving_yr'], errors='coerce')
df.loc[leaving_year.ne(0), 'leaving_yr'].value_counts().plot.bar()
plt.xlabel('YEAR')
plt.ylabel('No of Employee Left')


### Genderwise Employee who left the company

In [None]:
# Only rows with a real leaving year belong in a plot of people who left;
# leaving_yr == 0 marks employees who are still 'present'.
leaving_year = pd.to_numeric(df['leaving_yr'], errors='coerce')
left_company = df[leaving_year.ne(0)]

sns.countplot(x='leaving_yr', data=left_company, hue='male', palette='magma', saturation=0.75)
# The x axis shows leaving_yr, so label it as the leaving year (it was "Joining Year").
plt.xlabel("Leaving Year")
plt.ylabel("Employees Left")
plt.show()

#### From the above plot we can see that more male employees than female employees are leaving their jobs. We can also see that female employees are doing well in the company, because their leaving ratio is lower than that of the male employees.

## Hypothesis Testing

In [None]:
from scipy.stats import norm
from scipy.stats import t
import math

### Normalize Salary for Better Visualization

In [None]:
# Overwrite n_sal with Salary divided by 100000.
df['n_sal'] = df['Salary'].div(100000)

### As the question itself states that the employees should be freshers, we compare their graduation year with their joining year

In [None]:
print('Average Salary :')
print('Programmer Analyst :',round(df['n_sal'][(df['GraduationYear']==df['dyear']) & (df['Designation']=='programmer analyst')].mean(), 2))
print('Software Engineer :',round(df['n_sal'][(df['GraduationYear']==df['dyear']) & (df['Designation']=='software engineer')].mean(),2))
print('Hardware Engineer :',round(df['n_sal'][(df['GraduationYear']==df['dyear']) &(df['Designation']=='hardware engineer')].mean(), 2))
print('Associate Engineer :',round(df['n_sal'][(df['GraduationYear']==df['dyear']) &(df['Designation']=='associate engineer')].mean(), 2))

### Sample Data for Required Employees

In [None]:
# The four average salaries used as the sample, held as a numpy array.
sample = np.array([3.41, 3.6, 3.18, 3.15])

### Necessary variables initialization ex- sample mean

In [None]:
# Mean and number of observations of the sample; the mean is shown as the cell output.
sample_mean = sample.mean()
sample_size = sample.size
sample_mean

#### Sample Standard Deviation

In [None]:
# Sample standard deviation with the n-1 (Bessel) divisor. ddof=1 derives the
# divisor from the sample length instead of hard-coding 3, so the value stays
# correct if the sample size changes.
sample_std = np.std(sample, ddof=1)
print('Sample Standard Deviation :', sample_std)

### Calculating T-Score

In [None]:
def t_score(pop_mean, sample_mean, sample_std, sample_size):
    """Return the one-sample t statistic.

    Computes (sample_mean - pop_mean) divided by the standard error,
    where the standard error is sample_std / sqrt(sample_size).
    """
    standard_error = sample_std / (sample_size ** 0.5)
    return (sample_mean - pop_mean) / standard_error

### Necessary variables initialization ex- sample mean,population mean


In [None]:
# Hypothesised population mean, in the same units as the sample values.
pop_mean = 2.75

# sample_mean, sample_std and sample_size are deliberately NOT re-assigned here:
# they were already computed from `sample` in the cells above. Overwriting them
# with hand-typed, rounded copies (3.34, 0.21, 4) loses precision and silently
# goes stale whenever the sample changes.

### Calling T-score Function

In [None]:
# t statistic of the sample mean against the hypothesised population mean.
t_sc = t_score(
    pop_mean=pop_mean,
    sample_mean=sample_mean,
    sample_std=sample_std,
    sample_size=sample_size,
)
print('t-score :', t_sc)

#### Setting the Confidence Level

In [None]:
# Two Tail - Deciding the Significance Level & Calculating the t-critical value

confidence_level = 0.95
alpha = 1 - confidence_level

t_critical = t.ppf(1-alpha/2, df = 3)
print('t_critical :', t_critical)

In [None]:
# Visualizing the sampling distribution with its rejection regions.

# Horizontal extent of the plot.
x_min = 2
x_max = 6

# Centre and scale of the sampling distribution of the mean under the null
# hypothesis: `std` is the standard error, `dof` the degrees of freedom (n - 1).
mean = pop_mean
std = sample_std / (sample_size**0.5)
dof = sample_size - 1

# The cut-offs below come from the t distribution, so the curve must be the
# matching shifted and scaled t density. Under a normal curve the shaded tails
# would not add up to alpha.
x = np.linspace(x_min, x_max, 100)
y = t.pdf(x, dof, loc=mean, scale=std)
plt.xlim(x_min, x_max)
plt.plot(x, y)

# Left and right critical values of the two-tailed test, on the salary scale.
t_critical_left = pop_mean - t_critical * std
t_critical_right = pop_mean + t_critical * std

print('t_critical_left :', t_critical_left)
print('t_critical_right :', t_critical_right)

# Shade the left rejection region.
x_left = np.linspace(x_min, t_critical_left, 100)
y_left = t.pdf(x_left, dof, loc=mean, scale=std)
plt.fill_between(x_left, y_left, color='red')

# Shade the right rejection region.
x_right = np.linspace(t_critical_right, x_max, 100)
y_right = t.pdf(x_right, dof, loc=mean, scale=std)
plt.fill_between(x_right, y_right, color='red')

# Mark where the observed sample mean falls.
plt.scatter(sample_mean, 0)
plt.annotate("x_bar", (sample_mean, 0.1))

In [None]:
# Conclusion using the t-test: reject when |t| exceeds the critical value.
decision = (
    "Reject Null Hypothesis"
    if np.abs(t_sc) > t_critical
    else "Fail to reject Null Hypothesis"
)
print(decision)

In [None]:
# Conclusion using the p-value.

# The statistic is a t-score from a small sample, so the two-sided p-value must
# come from the t distribution with n - 1 degrees of freedom. The normal CDF
# understates it and disagrees with the t critical value used above.
p_value = 2 * t.sf(np.abs(t_sc), df=sample_size - 1)

print("p_value = ", p_value)

if p_value < alpha:
    print("Reject Null Hypothesis")
else:
    print("Fail to reject Null Hypothesis")