In [None]:
# important libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the cleaned dataset produced in step 2 and preview its first rows.
df = pd.read_csv('cleaned_data.csv')
df.head()

In [None]:
# List the column names available in df.
df.columns

In [None]:
## job title and seniority
## Pick out the main data-science-related job titles from the data.

def title_simplifier(title):
    """Collapse a raw job title into one coarse role label.

    The title is lowercased and scanned for keywords in a fixed priority
    order; the label of the first keyword found is returned, or 'na' when
    none of the keywords occurs.
    """
    lowered = title.lower()
    # Order matters: an earlier keyword wins when a title contains several.
    keyword_labels = (
        ('data scientist', 'data scientist'),
        ('data engineer', 'data engineer'),
        ('analyst', 'analyst'),
        ('machine learning', 'mle'),
        ('manager', 'manager'),
        ('director', 'director'),
    )
    for keyword, label in keyword_labels:
        if keyword in lowered:
            return label
    return 'na'

# Which jobs require senior-level experience?
def seniority(title):
    """Classify a job title as 'senior', 'jr' or 'na' from seniority keywords.

    The title is lowercased and split into alphanumeric words, and the
    markers are matched as whole words ('Sr.', 'sr' and 'SR' all count)
    rather than as substrings, so a title such as 'Newsroom Analyst' is not
    tagged senior merely because it contains the letters 'sr'.

    Parameters
    ----------
    title : str
        Raw job title.

    Returns
    -------
    str
        'senior' if any of sr / senior / lead / principal is present,
        otherwise 'jr' if jr / junior is present, otherwise 'na'.
    """
    # Turn every non-alphanumeric character into a space, then split on
    # whitespace, so punctuation such as "Sr." or "Senior/Lead" is handled.
    cleaned = ''.join(ch if ch.isalnum() else ' ' for ch in title.lower())
    words = set(cleaned.split())
    # Senior markers are checked first, so they win over junior markers.
    if words & {'sr', 'senior', 'lead', 'principal'}:
        return 'senior'
    if words & {'jr', 'junior'}:
        return 'jr'
    return 'na'

In [None]:
## Add the simplified job label as a new column of df, then eyeball ten random rows.
df['job_simp'] = df['Job Title'].map(title_simplifier)
df.sample(10)

In [None]:
# Number of rows per simplified job label.
df.job_simp.value_counts()

In [None]:
# Tag every row with its seniority label and count the rows per label.
df['seniority'] = df['Job Title'].map(seniority)
df['seniority'].value_counts()

In [None]:
# Length (in characters) of each job description.
df['desc_len'] = df['Job Description'].map(len)
df['desc_len']

In [None]:
## Count the competitors listed for each company.

def _count_competitors(raw):
    """Return the number of comma-separated entries in `raw`; the value '-1' yields 0."""
    if raw == '-1':
        return 0
    return len(raw.split(','))

df['num_comp'] = df['Competitors'].map(_count_competitors)
df['num_comp'].value_counts()

In [None]:
# Inspect the salary bounds of the rows flagged as hourly.
df[df.hourly==1][['hourly','min_salary','max_salary']]

In [None]:
# Hourly wage to annual: double both salary bounds on the rows flagged as hourly.
# NOTE(review): presumably salaries are stored in thousands per year, so
# hourly * 2 approximates an annual figure (about 2,000 hours) — confirm.
# NOTE: this updates df in place, so re-running the cell doubles the values again.
hourly_rows = df['hourly'] == 1
df.loc[hourly_rows, ['min_salary', 'max_salary']] *= 2

In [None]:
# Re-check the hourly rows after the conversion above.
df[df.hourly==1][['hourly','min_salary','max_salary']]

In [None]:
# Remove newline characters from the company_txt column.

df['company_txt'] = df['company_txt'].map(lambda name: name.replace('\n', ''))

In [None]:
# Spot-check five random company_txt values.
df.company_txt.sample(5)

In [None]:
# Plot a histogram for each feature (df.hist draws the numeric columns).
# This gives a quick overview of how the data features are distributed.
df.hist(bins=30, figsize=(20,20),color='g')

In [None]:
# Summary statistics for df.
df.describe()

In [None]:
# Oldest company: rows whose age equals 276.
# NOTE(review): 276 is hard-coded, presumably the maximum age reported by
# df.describe() — confirm it still matches the data.
df.loc[df['age'] == 276]

In [None]:
## Company providing the lowest salary: rows whose min_salary equals 15.
# NOTE(review): 15 is hard-coded, presumably the minimum min_salary reported
# by df.describe() — confirm it still matches the data.
df.loc[df['min_salary'] == 15]

In [None]:
## Key plot: distribution of avg_salary for each simplified job category.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='avg_salary', y='job_simp', data=df, ax=ax)

In [None]:
# Distribution of avg_salary for each job_state value.
fig, ax = plt.subplots(figsize=(14, 12))
sns.boxplot(x='avg_salary', y='job_state', data=df, ax=ax)

In [None]:
# Correlation heatmap of the numeric (and boolean) columns.
# The columns are selected explicitly because pandas >= 2.0 no longer drops
# non-numeric columns inside DataFrame.corr() and raises on string columns;
# select_dtypes works the same way on older pandas versions too.
correlations = df.select_dtypes(include=['number', 'bool']).corr()
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlations, annot=True, cmap="YlGnBu", linewidths=.5, ax=ax)

In [None]:
# Split the column names into numeric ('int' / 'float' dtypes) and everything else.
NUMERIC_DTYPES = ['int', 'float']

numeric_var = df.select_dtypes(include=NUMERIC_DTYPES).columns  # numeric variables

cat_var = df.select_dtypes(exclude=NUMERIC_DTYPES).columns  # categorical variables

In [None]:
# Show the columns selected as numeric.
numeric_var

In [None]:
# Show the columns selected as non-numeric.
cat_var

In [None]:
# Correlation heatmap restricted to a handful of numeric features.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
corr_subset = df[['age', 'avg_salary', 'Rating', 'desc_len', 'num_comp']].corr()
sns.heatmap(
    corr_subset,
    vmax=.3,
    center=0,
    cmap=cmap,
    annot=True,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

In [None]:
# Pick the categorical features that will be analysed below.
cat_columns = [
    'Location', 'Headquarters', 'Size', 'Type of ownership', 'Industry',
    'Sector', 'Revenue', 'company_txt', 'job_state', 'Same_Location_as_HQ',
    'python_jd', 'R_jd', 'spark_jd', 'aws_jd', 'excel_jd',
    'job_simp', 'seniority',
]
df_cat = df[cat_columns]

In [None]:
# Plot a bar chart of the value counts of every selected categorical feature.
for col in df_cat.columns:
    counts = df_cat[col].value_counts()
    print("graph for %s: total = %d" % (col, len(counts)))
    chart = sns.barplot(x=counts.index, y=counts)
    # Rotate the category labels so long names stay readable.
    tick_labels = chart.get_xticklabels()
    chart.set_xticklabels(tick_labels, rotation=90)
    plt.show()

In [None]:
# Some of the charts above are unreadable because they have too many bars,
# so for those features only the 20 most frequent values are plotted.
for col in ['Location', 'Headquarters', 'company_txt', 'Industry']:
    top_counts = df_cat[col].value_counts().head(20)
    print("graph for %s: total = %d" % (col, len(top_counts)))
    chart = sns.barplot(x=top_counts.index, y=top_counts)
    # Rotate the category labels so long names stay readable.
    tick_labels = chart.get_xticklabels()
    chart.set_xticklabels(tick_labels, rotation=90)
    plt.show()

In [None]:
# Mean avg_salary per simplified job position, highest first.
salary_by_role = df.pivot_table(index='job_simp', values='avg_salary')
salary_by_role.sort_values('avg_salary', ascending=False)

In [None]:
# Mean avg_salary per state, highest first.
salary_by_state = df.pivot_table(index='job_state', values='avg_salary')
salary_by_state.sort_values('avg_salary', ascending=False)

In [None]:
# Mean avg_salary per sector, highest first.
salary_by_sector = df.pivot_table(index='Sector', values='avg_salary')
salary_by_sector.sort_values('avg_salary', ascending=False)

In [None]:
# Mean avg_salary per company revenue band, highest first.
salary_by_revenue = df.pivot_table(index='Revenue', values='avg_salary')
salary_by_revenue.sort_values('avg_salary', ascending=False)

In [None]:
# Mean avg_salary per state, restricted to rows labelled 'data scientist', highest first.
data_scientists = df.loc[df['job_simp'] == 'data scientist']
ds_salary_by_state = data_scientists.pivot_table(index='job_state', values='avg_salary')
ds_salary_by_state.sort_values('avg_salary', ascending=False)

In [None]:
# Features to pivot against avg_salary (avg_salary itself is kept as the value column).
pivot_columns = [
    'Rating', 'Industry', 'Sector', 'Revenue', 'num_comp', 'hourly',
    'employer_provided', 'python_jd', 'R_jd', 'spark_jd', 'aws_jd',
    'excel_jd', 'Type of ownership', 'avg_salary',
]
df_pivots = df[pivot_columns]

In [None]:
# Pivot tables of avg_salary against each selected feature, highest first.
# 'avg_salary' itself is skipped: it is the value being aggregated, and using
# the same column as both index and values makes pivot_table raise.
for i in [col for col in df_pivots.columns if col != 'avg_salary']:
    print(i)
    print(pd.pivot_table(df_pivots, index=i, values='avg_salary').sort_values('avg_salary', ascending=False))

In [None]:
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:

# Concatenate every job description into one space-separated string.
words = " ".join(df['Job Description'].tolist())

def punctuation_stop(text):
    """Tokenize `text` and drop punctuation and English stop words.

    Parameters
    ----------
    text : str
        Raw text to tokenize with NLTK's `word_tokenize`.

    Returns
    -------
    list of str
        The lowercased tokens that are purely alphabetic and are not in
        NLTK's English stop-word list, in their original order.
    """
    stop_words = set(stopwords.words('english'))
    filtered = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Compare the lowercased form: the stop-word list is lowercase, so
        # capitalized stop words such as 'The' or 'And' would otherwise pass.
        if token.isalpha() and lowered not in stop_words:
            filtered.append(lowered)
    return filtered


# Filter the combined descriptions and join the surviving tokens back into one string.
words_filtered = punctuation_stop(words)
text = " ".join(words_filtered)

# Build the word cloud from the filtered text.
wordcloud_options = dict(
    background_color="white",
    random_state=1,
    stopwords=STOPWORDS,
    max_words=2000,
    width=800,
    height=1500,
)
wc = WordCloud(**wordcloud_options)
wc.generate(text)

# Render the cloud without axes.
plt.figure(figsize=[15, 30])
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()