...done using [Markdown Cheat Sheet](https://gtribello.github.io/mathNET/assets/notebook-writing.html) and [Unofficial Jupyter Extensions](https://jupyter-contrib-nbextensions.readthedocs.io/en/latest/index.html)

# Imports and Installations

In [1]:
import os
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns

# Functions

In [2]:
def read_csv(data_dir="C:/Users/Andrea/PycharmProjects/datascience/data/StackOverflow2020"):
    """Load the survey answers and the survey question schema as DataFrames.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Folder that contains ``survey_results_public.csv`` and
        ``survey_results_schema.csv``. The default is the folder the notebook
        was originally written against, so ``read_csv()`` with no argument
        reads the same two files as before; pass another folder to run the
        notebook on a different machine.

    Returns
    -------
    df : pandas.DataFrame
        Contents of ``survey_results_public.csv`` (the answers).
    schema : pandas.DataFrame
        Contents of ``survey_results_schema.csv`` (the questions).

    Raises
    ------
    FileNotFoundError
        Raised by ``pandas.read_csv`` when either file is missing.
    """
    df = pd.read_csv(os.path.join(data_dir, "survey_results_public.csv"))  # df with answers
    schema = pd.read_csv(os.path.join(data_dir, "survey_results_schema.csv"))  # df with questions
    return df, schema
# Load the answers (df) and the question texts (schema) once so the helpers and cells below can use them
df, schema = read_csv()

In [12]:
def get_description(column_name, schema=None):
    """Return the question text that belongs to a survey column.

    Parameters
    ----------
    column_name : str
        Label to look up in the ``Column`` column of ``schema``. It is
        converted with ``str()`` before the lookup.
    schema : pandas.DataFrame, optional
        Frame with a ``Column`` and a ``QuestionText`` column. When omitted,
        the module-level ``schema`` is used.

    Returns
    -------
    The ``QuestionText`` entry of the row whose ``Column`` equals
    ``column_name``.

    Raises
    ------
    KeyError
        If ``column_name`` does not occur in ``schema['Column']``.
    NameError
        If no ``schema`` argument is given and no module-level ``schema``
        exists.
    """
    if schema is None:
        # Resolve the default at call time rather than at definition time, so
        # the function follows a reloaded `schema` and can be defined before
        # the data has been read.
        try:
            schema = globals()["schema"]
        except KeyError:
            raise NameError(
                "name 'schema' is not defined; pass schema=... or load it with read_csv() first"
            ) from None
    # Single .loc[row, column] lookup instead of chained indexing.
    desc = schema.set_index('Column').loc[str(column_name), 'QuestionText']
    return desc

In [17]:
def get_nan_perc(df, threshold):
    """Summarise how much of each column of ``df`` is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float
        Cut-off on the same scale as the returned shares (a fraction, e.g.
        ``0.5`` for one half); the comparison is strictly greater-than.

    Returns
    -------
    perc_nan : pandas.Series
        Per column, the number of null entries divided by ``len(df)``
        (a fraction, not a value scaled to 100).
    perc_nan_over_x : pandas.Series of bool
        Per column, whether that share is greater than ``threshold``.
    number_over_x : integer
        How many columns are flagged in ``perc_nan_over_x``.
    """
    # share of null entries per column
    missing_share = df.isnull().sum() / len(df)
    # flag the columns whose share lies above the cut-off
    exceeds = missing_share > threshold
    # count the flagged columns
    n_exceeding = np.sum(exceeds)
    return missing_share, exceeds, n_exceeding

# Load & View Data

In [36]:
# Reads both CSV files again (they were already loaded right after read_csv was defined)
# and previews the first rows of the answers.
# NOTE(review): the second read looks redundant - confirm before removing it.
df, schema = read_csv()
df.head()

Unnamed: 0,Respondent,MainBranch,Hobbyist,Age,Age1stCode,CompFreq,CompTotal,ConvertedComp,Country,CurrencyDesc,...,SurveyEase,SurveyLength,Trans,UndergradMajor,WebframeDesireNextYear,WebframeWorkedWith,WelcomeChange,WorkWeekHrs,YearsCode,YearsCodePro
0,1,I am a developer by profession,Yes,,13,Monthly,,,Germany,European Euro,...,Neither easy nor difficult,Appropriate in length,No,"Computer science, computer engineering, or sof...",ASP.NET Core,ASP.NET;ASP.NET Core,Just as welcome now as I felt last year,50.0,36,27.0
1,2,I am a developer by profession,No,,19,,,,United Kingdom,Pound sterling,...,,,,"Computer science, computer engineering, or sof...",,,Somewhat more welcome now than last year,,7,4.0
2,3,I code primarily as a hobby,Yes,,15,,,,Russian Federation,,...,Neither easy nor difficult,Appropriate in length,,,,,Somewhat more welcome now than last year,,4,
3,4,I am a developer by profession,Yes,25.0,18,,,,Albania,Albanian lek,...,,,No,"Computer science, computer engineering, or sof...",,,Somewhat less welcome now than last year,40.0,7,4.0
4,5,"I used to be a developer by profession, but no...",Yes,31.0,16,,,,United States,,...,Easy,Too short,No,"Computer science, computer engineering, or sof...",Django;Ruby on Rails,Ruby on Rails,Just as welcome now as I felt last year,,15,8.0


In [37]:
# (number of rows, number of columns) of the answers frame
df.shape

(64461, 61)

In [13]:
# Look up the question text behind the MainBranch column
get_description('MainBranch')

'Which of the following options best describes you today? Here, by "developer" we mean "someone who writes code."'

In [33]:
# Missing-value statistics per column, flagging columns with more than half (0.5) of their values missing
perc_nan, perc_nan_over_x, number_over_x = get_nan_perc(df, .5)

In [34]:
# Share of missing values per column, as a fraction between 0 and 1
perc_nan

Respondent            0.000000
MainBranch            0.004638
Hobbyist              0.000698
Age                   0.294985
Age1stCode            0.101782
                        ...   
WebframeWorkedWith    0.344115
WelcomeChange         0.182715
WorkWeekHrs           0.361614
YearsCode             0.105133
YearsCodePro          0.280976
Length: 61, dtype: float64

In [35]:
# Per column: True where the missing share is above the 0.5 threshold passed to get_nan_perc
perc_nan_over_x

Respondent            False
MainBranch            False
Hobbyist              False
Age                   False
Age1stCode            False
                      ...  
WebframeWorkedWith    False
WelcomeChange         False
WorkWeekHrs           False
YearsCode             False
YearsCodePro          False
Length: 61, dtype: bool

In [32]:
# Number of columns whose missing share is above the threshold
number_over_x

0

# Data Preparation

# ...the Real Questions