In [50]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    input_paths = [os.path.join(root, name) for name in names]
    for path in input_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## 1.1 Import all Libraries

In [51]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import roc_auc_score

## 1.2 EDA 

## Description of the problem
CharityML is a fictitious charity organization located in the heart of Silicon Valley that was established to provide financial support for people eager to learn machine learning. After nearly 32,000 letters were sent to people in the community, CharityML determined that every donation they received came from someone that was making more than $50,000 annually. To expand their potential donor base, CharityML has decided to send letters to residents of California, but to only those most likely to donate to the charity. With nearly 15 million working Californians, CharityML has brought you on board to help build an algorithm to best identify potential donors and reduce overhead cost of sending mail. Your goal will be to evaluate and optimize several different supervised learners to determine which algorithm will provide the highest donation yield while also reducing the total number of letters being sent.

In [52]:
# Read the census records from the competition's input folder and preview the
# first 20 rows.
census_csv_path = "../input/udacity-mlcharity-competition/census.csv"
df = pd.read_csv(census_csv_path)
df.head(n=20)


In [53]:
# Summary statistics of df (by default pandas restricts this to the numeric
# columns of a mixed-type frame).
df.describe()

In [54]:
# Show the frame's dimensions, then the per-column dtypes and non-null counts.
# The shape is printed explicitly: a notebook only echoes the last expression
# of a cell, so a bare `df.shape` followed by another statement is discarded
# (`df.info()` prints its report itself and returns None).
print(df.shape)
df.info()

In [55]:
# Count the missing (NaN) values in each column.
df.isnull().sum()

In [56]:
# List the column labels of the census frame.
df.columns

### Describe the data 
* age: continuous.
* workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked.
* education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool.
* education-num: continuous.
* marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse.
* occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces.
* relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried.
* race: Black, White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other.
* sex: Female, Male.
* capital-gain: continuous.
* capital-loss: continuous.
* hours-per-week: continuous.
* native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

In [57]:
# Age distribution.
# An explicit figure/axes pair is used so the plot carries a title and axis
# labels and can be understood without reading the code.
fig, ax = plt.subplots()
ax.hist(df['age'], color='green', edgecolor='black')
ax.set(title='Age distribution', xlabel='age', ylabel='number of records')
plt.show()

In [58]:
# Marital-status distribution.
# 'marital-status' is categorical, so the categories are counted and drawn as
# one bar each. A histogram would instead spread the category positions over
# its default numeric bins, leaving gaps and bars that do not line up with the
# tick labels.
marital_counts = df['marital-status'].value_counts()
fig, ax = plt.subplots()
marital_counts.plot.bar(ax=ax, rot=90, color='green', edgecolor='black')
ax.set(title='Marital status distribution', xlabel='marital-status',
       ylabel='number of records')
plt.show()

In [59]:
# Race distribution.
# 'race' is categorical, so it is counted once and drawn as a bar chart (one
# bar per category) instead of being pushed through numeric histogram bins.
race_counts = df['race'].value_counts()
fig, ax = plt.subplots()
race_counts.plot.bar(ax=ax, color='green', edgecolor='black')
ax.set(title='Race distribution', xlabel='race', ylabel='number of records')
plt.show()

# The exact counts are the cell's displayed result.
race_counts

In [60]:
# Number of records for each value of 'sex'.
df['sex'].value_counts()

In [61]:
# Number of records for each income class (the prediction target).
df['income'].value_counts()

In [62]:
# Count the records for every (race, income) pair, pivot the income classes
# into columns (missing combinations become 0), then show the table and plot it.
race_income_sizes = df.groupby(['race', 'income']).size()
x = race_income_sizes.unstack(fill_value=0)
display(x)
x.plot()

In [63]:
# Number of records for every (sex, income) combination.
# Counting the non-null 'race' entries of each group serves as the row count.
y = df.groupby(['sex', 'income']).agg({'race': 'count'})
display(y)


In [64]:
# 0/1 view of the target for exploration: 1 for '>50K', 0 otherwise.
# The labels are stripped before comparing so the encoding works whether or not
# the CSV values carry surrounding whitespace; the rest of the notebook compares
# against '>50K' / '<=50K' without a leading space.
# The result is kept in its own Series instead of being written into `df`:
# a copy of the target stored in `df` would travel into the feature matrix when
# only the 'income' column is dropped later on.
income_encoded = (df['income'].str.strip() == '>50K').astype(int)
df.assign(income_encoded=income_encoded).head()

In [65]:
# Stacked horizontal bars: record count per occupation, split by income class.
occupation_income_table = pd.crosstab(df["occupation"], df['income'])
occupation_income_table.plot(figsize=(20, 10), stacked=True, kind='barh')

In [66]:
# Stacked horizontal bars: record count per race, split by income class.
race_income_table = pd.crosstab(df["race"], df['income'])
race_income_table.plot(figsize=(10, 5), stacked=True, kind='barh')

In [67]:
# Stacked horizontal bars: record count per education-num value, split by
# income class.
education_income_table = pd.crosstab(df["education-num"], df['income'])
education_income_table.plot(figsize=(10, 5), stacked=True, kind='barh')

In [68]:
# Record count per (sex, income) pair, measured as the number of non-null
# 'race' entries in each group.
sex_income_groups = df.groupby(['sex', 'income'])
y = sex_income_groups[['race']].count()
y

### Conclusion
The data are very unevenly distributed: most of the people in the dataset are white and male. <br>
Income is also very unevenly distributed. <br>
For example, white men earn more than 50K in 31.24% of the cases, but women in only 11.35%. <br>
Furthermore, you can see that the higher the education level, the more likely a person is to earn more than 50K.


In [69]:
# Headline numbers for the target variable.
income_labels = df['income']

# Total number of records
n_records = len(df)

# Number of records with an income above $50,000 / of at most $50,000
n_greater_50k = int((income_labels == '>50K').sum())
n_at_most_50k = int((income_labels == '<=50K').sum())

# Percentage of records with an income above $50,000
greater_percent = (100 / n_records) * n_greater_50k

# Print the results
summary_lines = [
    ("Total number of records: {}", n_records),
    ("Individuals making more than $50,000: {}", n_greater_50k),
    ("Individuals making at most $50,000: {}", n_at_most_50k),
    ("Percentage of individuals making more than $50,000: {}%", greater_percent),
]
for template, value in summary_lines:
    print(template.format(value))

## 2 Normalizing Numerical Features

In [70]:
# Split the data into the target label and the feature columns.

# Target label
income_raw = df['income']
display(income_raw.head())

# Feature columns: everything except the target.
# 'income_encoded' is a 0/1 copy of the target that the EDA section may have
# added to df; it is dropped too so the label cannot leak into the features.
# errors='ignore' keeps the cell working when that column is absent.
features_raw = df.drop(columns=['income', 'income_encoded'], errors='ignore')
display(features_raw.head())


### 2.1 The Scale of Your Data Matters
Machine learning models learn a mapping from input variables to an output variable.

As such, the scale and distribution of the data drawn from the domain may be different for each variable.

Input variables may have different units (e.g. feet, kilometers, and hours) that, in turn, may mean the variables have different scales.

Differences in the scales across input variables may increase the difficulty of the problem being modeled. An example of this is that large input values (e.g. a spread of hundreds or thousands of units) can result in a model that learns large weight values. A model with large weight values is often unstable, meaning that it may suffer from poor performance during learning and sensitivity to input values resulting in higher generalization error.

Min-max scaling is a common feature pre-processing technique which results in scaled data values that fall in the range [0,1]. When applied to a Python sequence, such as a Pandas Series, scaling results in a new sequence such that 0 is the minimum value and 1 is the maximum value of the prior unscaled sequence. If the sequence is [1, 2, 3], then the scaled sequence is [0, 0.5, 1].


https://machinelearningmastery.com/standardscaler-and-minmaxscaler-transforms-in-python/ <br>
https://www.kite.com/python/answers/how-to-scale-pandas-dataframe-columns-with-the-scikit-learn-minmaxscaler-in-python


In [74]:
# Log-transform the skewed features.
# We know that these two columns are highly skewed.
skewed = ['capital-gain', 'capital-loss']

# Work on a real copy. Wrapping features_raw in pd.DataFrame(...) does not copy
# by default, so (depending on the pandas version) the column assignment below
# could write through to features_raw, and re-running the cell would apply the
# logarithm a second time.
features_log_transformed = features_raw.copy()

# log1p(x) == log(x + 1); the +1 keeps zero values defined.
features_log_transformed[skewed] = features_raw[skewed].apply(np.log1p)



In [75]:
# Import sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the numerical features
scaler = MinMaxScaler()  # default feature_range=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Scale into a real copy so features_log_transformed keeps its unscaled values
# (pd.DataFrame(data=frame) does not copy by default and may alias the input).
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

# Show an example of a record with scaling applied
display(features_log_minmax_transform.head(n = 5))

## 3 Data preprocessing
Our machine learning algorithm cannot work with categorical features; it always needs numerical data. There are two ways to achieve that: label encoding and one-hot encoding. Label encoding gives every category a number (one, two, and so on). But this implies a ranking for the machine learning algorithm, because the number two would rank higher than the number one, which can cause a problem when the categories have no natural order. <br>
The second option, one-hot encoding, gives every category its own column that holds either 0 or 1. In the simplest case there are only two categories, for example married and unmarried: married would get 0 and unmarried would get 1. If we also include divorced, we already need three columns: married becomes 0,0,1, unmarried 0,1,0 and divorced 1,0,0. As you can see, this quickly gets a bit confusing. 
In our case we choose one-hot encoding. 

In [76]:
# One-hot encode the categorical columns of 'features_log_minmax_transform'
# using pandas.get_dummies(); numeric columns pass through unchanged.
features_final = pd.get_dummies(features_log_minmax_transform)

# Encode the 'income_raw' labels as 0 ('<=50K') / 1 ('>50K').
# An explicit mapping is used so that an unexpected label raises an error
# instead of being silently counted as the positive class; labels are stripped
# first so surrounding whitespace in the CSV does not matter.
income_codes = {'<=50K': 0, '>50K': 1}
income = income_raw.str.strip().map(income_codes)
if income.isna().any():
    unexpected = sorted(income_raw[income.isna()].astype(str).unique())
    raise ValueError("Unexpected income labels: {}".format(unexpected))
income = income.astype(int)

# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

# Show the encoded feature names
print(encoded)