In [124]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as p
from sklearn.cross_validation import KFold
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

# Tree-based feature selection
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectPercentile, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold



redundant_features = []
useful_features = []

def import_data():
    # For .read_csv, always use header=0 when you know row 0 is the header row
    df = pd.read_csv("./data/ElectionsData-full.csv", header=0)

#     df['split'] = 0
#     indices = KFold(n=len(df), n_folds=5, shuffle=True)._iter_test_indices()
#     df['split'][indices.next()] = 1
#     df['split'][indices.next()] = 2
#     raw_data = df.copy()

#     raw_data[raw_data['split'] == 0].drop('split', axis=1).to_csv('./data/output/raw_train.csv', index=False, sep=',')
#     raw_data[raw_data['split'] == 1].drop('split', axis=1).to_csv('./data/output/raw_test.csv', index=False, sep=',')
#     raw_data[raw_data['split'] == 2].drop('split', axis=1).to_csv('./data/output/raw_validation.csv', index=False)

    return df


def export_transformed_data(_df):
    _df[_df['split'] == 0].drop('split', axis=1).to_csv('./data/output/processed_train.csv', index=False)
    _df[_df['split'] == 1].drop('split', axis=1).to_csv('./data/output/processed_test.csv', index=False)
    _df[_df['split'] == 2].drop('split', axis=1).to_csv('./data/output/processed_validation.csv', index=False)


def group_features(_df):
    _df = _df.dropna()
    all_features = _df.drop(['Vote', 'split'], axis=1).columns
    categorical_features = _df.drop(['Vote', 'split'], axis=1).select_dtypes(include=["object"])
    numeric_features = _df.drop(['Vote', 'split'], axis=1).select_dtypes(exclude=["object"])

    return [all_features, categorical_features, numeric_features]


def fill_numeric_features(_df, features):
    for f in features:
        _df[f].fillna(_df[f].median(), inplace=True)


def fill_categorical_features(_df, features):
    for f in features:
        _df[f].fillna(_df[f].value_counts().idxmax(), inplace=True)


def transform_categorical_features(_df, features):
    for f in features:
        _df[f] = _df[f].astype("category")
        _df[f + "_Int"] = _df[f].cat.rename_categories(range(_df[f].nunique())).astype(int).astype(float)
        _df.loc[_df[f].isnull(), f + "_Int"] = np.nan  # fix NaN conversion3


def transform_label(_df, label):
    _df[label] = _df[label].astype("category").cat.rename_categories(range(_df[label].nunique())).astype(int).astype(float)

def outliar_detection(_df, features):
    # Outliar detection
    threshold = 3
    for f in numeric_features:
        std = _df[f].std()
        mean = _df[f].mean()
        _df = _df[_df[f].between(mean - threshold * std, mean + threshold * std)]
    return _df

def scale_numeric(_df, features):
    for f in features:
        _df[f] = (_df[f] - _df[f].min()) / (_df[f].max() - _df[f].min())


def transform_bool(_df, name):
    _df[name] = _df[name].map({'No': -1, "Maybe": 0, 'Yes': 1}).astype(int).astype(float)

def transform_category(_df, name):
    redundant_features.append(name)
    for cat in df[name].unique():
        _df["Is_" + name + "_" + cat] = (_df[name] == cat).astype(int).astype(float)
    del _df[name]

        
def transform_manual(_df):
    _df["Age_group"] = _df["Age_group"].map({'Below_30': 0, '30-45': 1, '45_and_up': 2}).astype(float)
    _df["Voting_Time"] = _df["Voting_Time"].map({'By_16:00': 0, 'After_16:00': 1}).astype(float)
    _df["Gender"] = _df["Gender"].map({'Male': -1, 'Female': 1}).astype(float)
    
    
    transform_bool(_df, "Looking_at_poles_results")
    transform_bool(_df, "Married")
    transform_bool(_df, "Financial_agenda_matters")
    transform_bool(_df, "Will_vote_only_large_party")
    transform_category(_df, "Most_Important_Issue")
    transform_category(_df, "Occupation")
    transform_category(_df, "Main_transportation")

def to_np_array(_df):
    df_data_X = _df.drop(['split','Vote'], axis=1).values
    df_data_Y = _df.Vote.values
    features_list = _df.drop(['split','Vote'], axis=1).columns
    return [df_data_X, df_data_Y, features_list]
        
def variance_filter(data_X, features_list):
    varsel = VarianceThreshold(threshold=0.01)
    varsel.fit_transform(data_X)
    featsel_idx = varsel.get_support()
    print 'Removing features with low variance - ', '\t', list(features_list[~featsel_idx])
    return list(features_list[~featsel_idx])
        
def select_features_with_rfe(data_X, data_Y, feature_names):
    result = []
    
    svc = SVC(kernel="linear", C=1)
    rfecv = RFECV(estimator=svc, step=1, cv=3, scoring='accuracy')
    rfecv.fit(data_X, data_Y)

    print("RFE - Optimal number of features : %d" % rfecv.n_features_)
    
    for idx, val in enumerate(rfecv.get_support()):
        if val:
            print "RFE - Choosing feature: " + feature_names[idx]
            result.append(feature_names[idx]) 
    return result


def select_features_with_rfe_with_stratified_k_fold(data_X, data_Y, feature_names):
    result = []
    
    svc = SVC(kernel="linear", C=1)
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfecv.fit(data_X, data_Y)

    print("RFE stratified_k - Optimal number of features : %d" % rfecv.n_features_)
    
    for idx, val in enumerate(rfecv.get_support()):
        if val:
            print "RFE stratified_k - Choosing feature: " + feature_names[idx]
            result.append(feature_names[idx]) 
    return result

def univariate_features_with_mi(data_X, data_Y, feature_names):
    result = []
    
    selector = SelectPercentile(mutual_info_classif, percentile=25)
    selector.fit(data_X, data_Y)

    for idx, val in enumerate(selector.get_support()):
        if val:
            result.append(feature_names[idx]) 
            print "MI - Choosing feature: " + feature_names[idx]
    
    return result



def univariate_features_with_f_classif(data_X, data_Y, feature_names):
    result = []
    
    selector = SelectPercentile(f_classif, percentile=25)
    selector.fit(data_X, data_Y)

    for idx, val in enumerate(selector.get_support()):
        if val:
            result.append(feature_names[idx]) 
            print "f-classif - Choosing feature: " + feature_names[idx]
    
    return result

    
def univariate_features_with_f_classif(data_X, data_Y, feature_names):
    result = []
    
    selector = SelectPercentile(f_classif, percentile=25)
    selector.fit(data_X, data_Y)

    for idx, val in enumerate(selector.get_support()):
        if val:
            result.append(feature_names[idx]) 
            print "f-classif - Choosing feature: " + feature_names[idx]
    
    return result
     
def embedded_features_by_descision_tree(data_X, data_Y, feature_names):    
    result = []

    clf = ExtraTreesClassifier()
    clf = clf.fit(data_X, data_Y)
    tree_weights = clf.feature_importances_  
    tree_weights /= tree_weights.max()
    tree_booleans = tree_weights > np.percentile(tree_weights, 75)
    for idx, val in enumerate(tree_booleans):
        if val:
            result.append(feature_names[idx]) 
            print "Tree Clasifier - Choosing feature: " + feature_names[idx]
    
    return result
    
# redundant_features = []
# useful_features = []

df = import_data()

all_features, categorical_features, numeric_features = group_features(df)

fill_numeric_features(df, numeric_features)
fill_categorical_features(df, categorical_features)
# transform_categorical_features(df, categorical_features)  # We Don't need that!!
transform_label(df, "Vote")
transform_manual(df)

scale_numeric(df, numeric_features)
print "Before outliar detacction: " + str(df.shape[0])
df = outliar_detection(df, numeric_features)
print "After outliar detacction: " + str(df.shape[0])

df.info()


df_data_X, df_data_Y, features_list = to_np_array(df)
df_data_X_scaled = preprocessing.scale(df_data_X)

features_to_exclude = variance_filter(df_data_X_scaled, features_list)
redundant_features.extend(features_to_exclude)





ValueError: labels ['split'] not contained in axis

In [119]:
#############################################
good_features = select_features_with_rfe(df_data_X, df_data_Y, features_list)
useful_features.extend(good_features)

good_features = univariate_features_with_mi(df_data_X, df_data_Y, features_list)
useful_features.extend(good_features)

good_features = univariate_features_with_f_classif(df_data_X, df_data_Y, features_list)
useful_features.extend(good_features)

good_features = embedded_features_by_descision_tree(df_data_X, df_data_Y, features_list)
useful_features.extend(good_features)

good_features = select_features_with_rfe_with_stratified_k_fold(df_data_X, df_data_Y, features_list)
useful_features.extend(good_features)

#############################################
useful_features = list(set(useful_features))
useful_features

RFE - Optimal number of features : 23
RFE - Choosing feature: AVG_lottary_expanses
RFE - Choosing feature: Avg_Satisfaction_with_previous_vote
RFE - Choosing feature: Looking_at_poles_results
RFE - Choosing feature: Garden_sqr_meter_per_person_in_residancy_area
RFE - Choosing feature: Married
RFE - Choosing feature: Yearly_IncomeK
RFE - Choosing feature: Avg_monthly_expense_on_pets_or_plants
RFE - Choosing feature: Avg_monthly_household_cost
RFE - Choosing feature: Will_vote_only_large_party
RFE - Choosing feature: Phone_minutes_10_years
RFE - Choosing feature: Avg_size_per_room
RFE - Choosing feature: Weighted_education_rank
RFE - Choosing feature: Last_school_grades
RFE - Choosing feature: Political_interest_Total_Score
RFE - Choosing feature: Number_of_valued_Kneset_members
RFE - Choosing feature: Overall_happiness_score
RFE - Choosing feature: Is_Most_Important_Issue_Environment
RFE - Choosing feature: Is_Most_Important_Issue_Social
RFE - Choosing feature: Is_Most_Important_Issue_M

['Is_Most_Important_Issue_Other',
 'Is_Most_Important_Issue_Financial',
 'Yearly_IncomeK',
 'Number_of_valued_Kneset_members',
 'Will_vote_only_large_party',
 'AVG_lottary_expanses',
 'Avg_monthly_household_cost',
 'Phone_minutes_10_years',
 'Is_Most_Important_Issue_Military',
 'Looking_at_poles_results',
 'Overall_happiness_score',
 'Married',
 'Is_Most_Important_Issue_Environment',
 'Garden_sqr_meter_per_person_in_residancy_area',
 'Is_Most_Important_Issue_Education',
 'Avg_size_per_room',
 'Is_Most_Important_Issue_Foreign_Affairs',
 'Political_interest_Total_Score',
 'Last_school_grades',
 'Avg_Satisfaction_with_previous_vote',
 'Weighted_education_rank',
 'Is_Most_Important_Issue_Social',
 'Avg_monthly_expense_on_pets_or_plants']

In [121]:
good_features = select_features_with_rfe_with_stratified_k_fold(df_data_X, df_data_Y, features_list)
useful_features.extend(good_features)
good_features

RFE stratified_k - Optimal number of features : 23
RFE stratified_k - Choosing feature: AVG_lottary_expanses
RFE stratified_k - Choosing feature: Avg_Satisfaction_with_previous_vote
RFE stratified_k - Choosing feature: Looking_at_poles_results
RFE stratified_k - Choosing feature: Garden_sqr_meter_per_person_in_residancy_area
RFE stratified_k - Choosing feature: Married
RFE stratified_k - Choosing feature: Yearly_IncomeK
RFE stratified_k - Choosing feature: Avg_monthly_expense_on_pets_or_plants
RFE stratified_k - Choosing feature: Avg_monthly_household_cost
RFE stratified_k - Choosing feature: Will_vote_only_large_party
RFE stratified_k - Choosing feature: Phone_minutes_10_years
RFE stratified_k - Choosing feature: Avg_size_per_room
RFE stratified_k - Choosing feature: Weighted_education_rank
RFE stratified_k - Choosing feature: Last_school_grades
RFE stratified_k - Choosing feature: Political_interest_Total_Score
RFE stratified_k - Choosing feature: Number_of_valued_Kneset_members
RFE 

['AVG_lottary_expanses',
 'Avg_Satisfaction_with_previous_vote',
 'Looking_at_poles_results',
 'Garden_sqr_meter_per_person_in_residancy_area',
 'Married',
 'Yearly_IncomeK',
 'Avg_monthly_expense_on_pets_or_plants',
 'Avg_monthly_household_cost',
 'Will_vote_only_large_party',
 'Phone_minutes_10_years',
 'Avg_size_per_room',
 'Weighted_education_rank',
 'Last_school_grades',
 'Political_interest_Total_Score',
 'Number_of_valued_Kneset_members',
 'Overall_happiness_score',
 'Is_Most_Important_Issue_Environment',
 'Is_Most_Important_Issue_Social',
 'Is_Most_Important_Issue_Military',
 'Is_Most_Important_Issue_Financial',
 'Is_Most_Important_Issue_Education',
 'Is_Most_Important_Issue_Other',
 'Is_Most_Important_Issue_Foreign_Affairs']

In [125]:
df = import_data()
cor = df.corr()
indices = np.where(cor > 0.95)
indices = [(cor.index[x], cor.columns[y]) for x, y in zip(*indices) if x != y and x < y]
indices

[('Avg_monthly_expense_when_under_age_21',
  'Avg_Satisfaction_with_previous_vote'),
 ('Garden_sqr_meter_per_person_in_residancy_area',
  'Avg_monthly_expense_on_pets_or_plants'),
 ('Garden_sqr_meter_per_person_in_residancy_area', 'Phone_minutes_10_years'),
 ('Yearly_IncomeK', 'Avg_monthly_household_cost'),
 ('Yearly_IncomeK', 'Avg_size_per_room'),
 ('Avg_monthly_expense_on_pets_or_plants', 'Phone_minutes_10_years'),
 ('Avg_monthly_household_cost', 'Political_interest_Total_Score')]

In [140]:

def fill_f1_with_f2(_df, f1, f2):
    ratio = _df[f1].mean() / _df[f2].mean()
    print "ration betweed " + f1 + " and " + f2 + "is " + str(ratio)
    for index, row in _df[_df[f1].isnull()].iterrows():
        if ~np.isnan(_df[f2][index]):
            _df.loc[f1][index] = _df[f2][index] * ratio 

        
for pair in indices:
    dfs = df[[pair[0],pair[1]]].copy()
    dfs["x"] = df[pair[0]] / df[pair[1]]
    dfs["y"] = df[pair[0]].mean() / df[pair[1]].mean()
    print dfs.head(10)
    fill_f1_with_f2(df, pair[0], pair[1])


   Avg_monthly_expense_when_under_age_21  Avg_Satisfaction_with_previous_vote  \
0                             520.295896                           173.431965   
1                             695.412346                           231.804115   
2                             102.471366                                  NaN   
3                             392.898135                           130.966045   
4                            1825.062431                           608.354144   
5                             540.959917                           180.319972   
6                             295.794850                            98.598283   
7                              60.152418                            20.050806   
8                             217.746310                            72.582103   
9                            -284.978409                           -94.992803   

     x         y  
0  3.0  3.001356  
1  3.0  3.001356  
2  NaN  3.001356  
3  3.0  3.001356  
4  3.0  3.001

In [111]:
df["x"]
# df[df["x"] > 1.000001].count()
# df[df["x"] < 0.999999].count()

0       3.0
1       3.0
2       NaN
3       3.0
4       3.0
5       3.0
6       3.0
7       3.0
8       3.0
9       3.0
10      3.0
11      3.0
12      3.0
13      3.0
14      3.0
15      3.0
16      3.0
17      3.0
18      3.0
19      3.0
20      3.0
21      3.0
22      3.0
23      3.0
24      3.0
25      3.0
26      3.0
27      3.0
28      3.0
29      3.0
       ... 
9970    3.0
9971    3.0
9972    3.0
9973    NaN
9974    3.0
9975    3.0
9976    3.0
9977    3.0
9978    3.0
9979    3.0
9980    3.0
9981    3.0
9982    3.0
9983    3.0
9984    3.0
9985    3.0
9986    3.0
9987    3.0
9988    3.0
9989    3.0
9990    3.0
9991    3.0
9992    3.0
9993    3.0
9994    3.0
9995    3.0
9996    3.0
9997    3.0
9998    3.0
9999    3.0
Name: x, Length: 10000, dtype: float64