## Exploratory Data Analysis II

IT Salary Survey EU  2020

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw survey export.
# A forward slash works on every OS; the previous 'datasets\IT ...' only worked
# because '\I' is not a recognised escape sequence (recent Python versions warn
# about that) and the backslash separator is Windows-only.
df = pd.read_csv('datasets/IT Salary Survey EU  2020.csv')

In [None]:
# Preview the first rows of the freshly loaded survey data.
df.head()

In [None]:
# Show the raw column labels; the next cells drop and rename columns by these exact strings.
df.columns

In [None]:
# Columns that are not used anywhere in this analysis.
# Each label is listed exactly once (the 'Annual bonus+stocks one year ago...'
# label used to appear twice in this list).
unused_columns = [
    'Timestamp',
    'Years of experience in Germany',
    'Yearly bonus + stocks in EUR',
    'Annual brutto salary (without bonus and stocks) one year ago. Only answer if staying in the same country',
    'Annual bonus+stocks one year ago. Only answer if staying in same country',
    'Have you lost your job due to the coronavirus outbreak?',
    'Have you been forced to have a shorter working week (Kurzarbeit)? If yes, how many hours per week',
    'Have you received additional monetary support from your employer due to Work From Home? If yes, how much in 2020 in EUR',
]
df = df.drop(columns=unused_columns)

In [None]:
# Shorter column names used by the rest of the notebook.
# Note: 'Position ' really has a trailing space in the raw header.
# 'Yearly bonus + stocks in EUR' is intentionally not mapped: that column is
# dropped in the previous cell, so a rename entry for it could never match.
short_names = {
    'Position ': 'Position',
    'Your main technology / programming language': 'Main technology',
    'Other technologies/programming languages you use often': 'Other technologies',
    'Yearly brutto salary (without bonus and stocks) in EUR': 'Yearly salary',
    'Number of vacation days': 'Vacation days',
    'Main language at work': 'Language at work',
    'Total years of experience': 'Years of experience',
}
# Assign the result instead of mutating in place so the cell reads like the
# other `df = df....` steps in this notebook.
df = df.rename(columns=short_names)

In [None]:
# Preview the frame again after dropping and renaming columns.
df.head()

In [None]:
# Print each column's dtype and non-null count to see what needs type conversion.
df.info()

In [None]:
# Count the missing values in every column.
df.isna().sum()

In [None]:
# Missing ages are replaced by the placeholder 0 so the column can be stored as
# integers; 0 is not a real age and has to be filtered out before analysing ages.
#
# The dtype is spelled out as 'int64' because plain `int` follows the platform
# default integer, which is only 32 bits wide on Windows with NumPy < 2 and
# would overflow on very large salary entries.
df['Age'] = df['Age'].fillna(0).astype('int64')
df['Yearly salary'] = df['Yearly salary'].astype('int64')

In [None]:
# Parse the free-text numeric columns: entries that cannot be parsed become NaN,
# NaN is replaced by 0, and the result is stored as integers.
for column in ('Vacation days', 'Years of experience'):
    parsed = pd.to_numeric(df[column], errors='coerce')
    df[column] = parsed.fillna(0).astype(int)

Outlier detection

In [None]:

# One box plot per numeric column, laid out on a 2x2 grid (row-major order).
numeric_columns = ['Age', 'Years of experience', 'Yearly salary', 'Vacation days']

# Create the figure and its four axes in one call, and use a figure-level title.
# Calling plt.title() before the subplots exist would implicitly create an extra
# full-size Axes that stays drawn behind the grid.
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(15, 8))
fig.suptitle('The distribution of the values:', fontsize=16)

for ax, column in zip(axs.flat, numeric_columns):
    sns.boxplot(data=df, x=column, ax=ax)

# Keep the axis labels of the top row clear of the bottom row.
fig.tight_layout()
plt.show()

In [None]:
# Salaries in ascending order, with their index labels; the extremes sit at both ends of the output.
df['Yearly salary'].sort_values()

In [None]:
# Delete the extreme salary outliers by their index labels.
# NOTE(review): the labels are hard-coded for this particular CSV export —
# confirm them against the sorted output above if the data file changes.
salary_outlier_index = [473, 853]

# errors='ignore' keeps the cell re-runnable: once the rows are gone, running
# it a second time would otherwise raise KeyError.
df = df.drop(index=salary_outlier_index, errors='ignore')

In [None]:
# Re-check the sorted salaries after removing the outlier rows.
df['Yearly salary'].sort_values()

Data Visualization

In [None]:
# Ages in ascending order; missing ages were filled with 0 earlier, so they sort first.
df['Age'].sort_values()

In [None]:
# Collect the index labels of the rows whose age is the 0 placeholder.
age_is_placeholder = df['Age'] == 0
drop_index = df.loc[age_is_placeholder].index.tolist()

In [None]:
# Remove the rows collected in `drop_index` (row labels, hence the index axis).
df = df.drop(labels=drop_index, axis='index')

In [None]:
# Number of participants per age.
plt.figure(figsize=(18, 6))
age_ax = sns.countplot(x='Age', data=df)
age_ax.set_title('The distribution of the ages of the participants:', fontsize=16)
age_ax.set_xlabel('Age', fontsize=14)
age_ax.set_ylabel('Count', fontsize=14)
plt.show()

In [None]:
# Median yearly salary for every age, one bar per age.
age_salary = df.groupby('Age', as_index=False)['Yearly salary'].median()

plt.figure(figsize=(14, 6))
sns.barplot(data=age_salary, x='Age', y='Yearly salary', palette='flare')
# The statistic plotted is the median, so the title says "median" rather than "average".
plt.title('The median salary for each age:', fontsize=16)
plt.xlabel('Age', fontsize=14)
plt.ylabel('Yearly Salary', fontsize=14)
plt.xticks(fontsize=12)
plt.show()


In [None]:
# Age distribution per gender.
plt.figure(figsize=(16, 8))
gender_ax = sns.boxplot(x='Gender', y='Age', data=df, palette='deep')
gender_ax.set_title('The gender and the age:', fontsize=16)
gender_ax.set_xlabel('')
gender_ax.set_ylabel('Age', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
# Histogram of the yearly salary, split by gender, with a KDE curve on top.
plt.figure(figsize=(18, 6))
salary_ax = sns.histplot(
    data=df,
    x='Yearly salary',
    hue='Gender',
    bins=30,
    kde=True,
    palette='magma',
)
salary_ax.set_title('The distribution of the yearly salary:', fontsize=16)
salary_ax.set_xlabel('Yearly Salary', fontsize=14)
salary_ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# Number of respondents per seniority level, most frequent first.
df['Seniority level'].value_counts()

In [None]:
# Unify the spelling variants of technology names.
# Each rule is (exact cell values to replace, canonical spelling); the rules are
# applied to the whole frame, in this order.
spelling_rules = (
    (['javascript', 'Javascript', 'js', 'JS'], 'JavaScript'),
    (['python', 'Python '], 'Python'),
    (['c++'], 'C++'),
    (['.net', '.Net'], '.NET'),
    (['Sql', 'sql'], 'SQL'),
    (['php', 'Php'], 'PHP'),
    (['java'], 'Java'),
    (['typescript', 'Typescript'], 'TypeScript'),
)
for variants, canonical in spelling_rules:
    df = df.replace(variants, canonical)

In [None]:
# Bar chart of the ten most frequent values in 'Main technology'.
top_technologies = df['Main technology'].value_counts().head(10)

plt.figure(figsize=(16, 6))
ax = top_technologies.plot(kind='bar', color='royalblue')
ax.set_title('Top 10 used programming languages and technologies:', fontsize=20)
ax.set_ylabel('Count', fontsize=16)
plt.xticks(fontsize=14, rotation=45)
# Write each bar's count inside the bar, rotated to fit.
ax.bar_label(ax.containers[0], label_type='center', fontsize=16, rotation=90, color='w')
plt.show()

In [None]:
# Salary spread per main technology and gender, restricted to a fixed list of technologies.
selected_technologies = ['Python', 'Java', 'JavaScript', 'PHP', 'C++', 'Scala', 'Swift', '.NET', 'Go']
uses_selected = df['Main technology'].isin(selected_technologies)
salary_language = df.loc[uses_selected]

plt.figure(figsize=(16, 8))
language_ax = sns.boxplot(
    data=salary_language,
    x='Main technology',
    y='Yearly salary',
    hue='Gender',
    palette='viridis',
)
language_ax.set_title('The relationship between the top used programming languages , gender and the yearly salary:', fontsize=16)
language_ax.set_xlabel('')
language_ax.set_ylabel('Yearly salary', fontsize=14)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.show()