In [15]:
import pandas as pd
import numpy as np
import pickle


# Functions

In [145]:
def moving_average(average,new,number):
    """Fold one more observation into a running mean.

    average: mean of the `number` observations seen so far
    new:     the observation being added
    number:  how many observations `average` already summarises

    Returns the mean over `number + 1` observations. Works element-wise
    when `average` and `new` are numpy arrays.
    """
    # Undo the previous division to recover the running sum, then add the new value.
    running_total = average * number + new
    return running_total / (number + 1)

"""
values: numpy array of aggregatable values
"""
def aggregate(values):
    """Running column-wise average down the rows of `values`.

    values: 2-D numpy array of aggregatable values (one row per date)

    Returns an array of the same shape in which row i holds the mean of
    rows 0..i of `values`, computed incrementally with `moving_average`.
    """
    running = np.zeros(values.shape[1])   # mean of the rows seen so far
    history = np.zeros(values.shape)      # running mean recorded after every row

    for seen, row in enumerate(values):
        # `seen` is the number of rows already folded into `running`.
        running = moving_average(running, row, seen)
        history[seen] = running
    return history

# Variant of `aggregate` that keeps only the last running average.
def final_aggregate(values):
    """Column-wise average over all rows of `values`, built incrementally.

    values: 2-D numpy array of aggregatable values (one row per date)

    Returns a 1-D array with one entry per column: the running average
    after the last row has been folded in (all zeros if there are no rows).
    """
    running = np.zeros(values.shape[1])
    for seen, row in enumerate(values):
        # `seen` is the number of rows already folded into `running`.
        running = moving_average(running, row, seen)
    return running

def aggregated_data(ID: str, agg_index: int, features: list, raw : pd.core.frame.DataFrame  ) -> np.ndarray:
    """Replace each person's aggregatable columns with their running average.

    For every distinct value in column `ID`, that person's rows are ordered
    by 'dateorder' and the columns from position `agg_index` onwards are
    replaced by their running mean (see `aggregate`). Columns before
    `agg_index` are passed through unchanged.

    ID:        name of the column identifying a person
    agg_index: position (within `features`) of the first column to aggregate
    features:  columns to keep, in output order; must include `ID` and
               'dateorder'
    raw:       source data frame; every selected column must be castable
               to float

    Returns a float array with one row per input row and one column per
    entry of `features`. Rows are grouped by person; the order of the
    groups follows the iteration order of a Python set of the ids.
    """
    raw = raw[features]

    IDs = set(raw[ID].tolist()) # individual ids

    num_cols = len(raw.columns)

    # Collect one block per person and concatenate once at the end;
    # concatenating inside the loop re-copies the whole result every time.
    per_person = []
    for person in IDs:
        values = raw[raw[ID] == person].sort_values(by=['dateorder']).values.astype("float")
        unagg_values = values[:, :agg_index]
        agg_values = values[:, agg_index:]
        per_person.append(np.concatenate((unagg_values, aggregate(agg_values)), axis = 1))

    if not per_person:
        # np.concatenate rejects an empty list; keep the empty-result shape.
        return np.empty((0, num_cols))
    return np.concatenate(per_person, axis = 0)

def aggregate_data2(ID: str, agg_index: int, features: list, raw : pd.core.frame.DataFrame  ) -> np.ndarray:
    """Collapse each person's rows into one row of overall averages.

    For every distinct value in column `ID`, that person's rows are ordered
    by 'dateorder' and the columns from position `agg_index` onwards are
    averaged over all of the person's rows (see `final_aggregate`).

    ID:        name of the column identifying a person
    agg_index: position (within `features`) of the first column to aggregate
    features:  columns to use; must include `ID` and 'dateorder'
    raw:       source data frame; every selected column must be castable
               to float

    Returns a float array with one row per person: column 0 is the id,
    the remaining columns are the averages of `features[agg_index:]`.
    Row order follows the iteration order of a Python set of the ids.
    """
    raw = raw[features]

    IDs = set(raw[ID].tolist()) # individual ids

    # Locate the id column in the selected frame instead of assuming that
    # 'maleid' is column 0 and 'femaleid' is column 1.
    index = list(raw.columns).index(ID)

    length = len(raw.columns[agg_index:]) + 1 #+ id

    rows = []
    for person in IDs:
        values = raw[raw[ID] == person].sort_values(by=['dateorder']).values.astype("float")
        if values.shape[0] == 0:
            # An id that matches no rows (NaN never equals itself) has nothing to average.
            continue
        aggre = np.zeros(length)
        aggre[0] = values[0, index]
        aggre[1:] = final_aggregate(values[:, agg_index:])
        rows.append(aggre)

    if not rows:
        return np.empty((0, length))
    # Build the result once rather than re-copying it for every person.
    return np.array(rows)

def preliminaries(PATH, fill_one_columns=('Liked', 'SexAtt', "LikYes", 'FSexAtt')):
    """Load a raw CSV and replace every missing value with a number.

    PATH:             location of the CSV file
    fill_one_columns: columns whose missing entries become 1 instead of 0.
                      The default list is specific to Sample B, where one
                      participant (maleID 219) entered no values for the
                      dependent measures and was given a 1.

    Fields that are empty or a single space are read as missing. Missing
    entries in `fill_one_columns` are set to 1; all other missing entries
    are set to 0.
    # consider maybe removing those rows if time permits you

    Returns the cleaned DataFrame. Raises KeyError if a column named in
    `fill_one_columns` is not in the file.
    """
    missing_values = [" ", ""]

    raw = pd.read_csv(PATH, na_values = missing_values)

    #NOTE: this needs revision. The default columns are only particular to Sample B
    one_cols = list(fill_one_columns)
    if one_cols:
        raw[one_cols] = raw[one_cols].fillna(1)

    # Everything still missing becomes 0.
    return raw.fillna(0)


# Preliminaries

In [140]:


# Outcome columns; `measures` adds the yes/no decisions and the match flag.
dependent_measures = ['liked', 'sexatt', "likyes", 'fliked', 'fsexatt', "flikyes"]
measures = dependent_measures + ["saidyes", "fsaidyes", "match"]

# Column groups used to build the male/female feature sets.
# this will be changed
features = {
    # non aggregated features
    'non_aggregated': ['maleid', 'femaleid', 'dateorder'],

    # Both
    'both': ['inter1', 'deplet1', 'pliked', 'psexatt',
             'finter1', 'fdeplet1', 'fpliked', 'fpsexatt'],

    # One sided individual view
    'individual': ['common', 'simper', 'connect'],

    # one sided partners view
    # note: some features were removed so as to fit sample A
    'partner': ['pi1', 'pi2', 'pi3', 'pi4', 'pi5', 'pi6', 'pi7', 'pi8',
                'pi9', 'pi10', 'pi11', 'ip12', 'if1', 'if2', 'if3',
                'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12'],
}


#NOTE: encode some categorical data and put it in non_aggregated
# see https://pbpython.com/categorical-encoding.html

# NOTE(review): machine-specific absolute paths; this cell only runs where these files exist.
PATH = "/Users/oluwaseuncardoso/492_19summer/MachineLearning/RAW DATA/raw.csv"
PATH2 = "/Users/oluwaseuncardoso/492_19summer/MachineLearning/RAW DATA/raw2.csv"
missing_values = [" ", ""]

# First file: cleaned by `preliminaries`, then column names are lower-cased
# and the four "...34" columns are renamed to the "...12" names used in `features`.
raw1 = (
    preliminaries(PATH)
    .rename(columns=str.lower)
    .rename(columns={"if34": "if12", "ip34": "ip12", "fif34": "fif12", "fip34": "fip12"})
)

# Second file: every missing value becomes 0, column names are lower-cased.
raw2 = (
    pd.read_csv(PATH2, na_values=missing_values)
    .fillna(0)
    .rename(columns=str.lower)
)

# "f"-prefixed counterparts of the partner-view and individual-view columns.
pmale_features = ["f" + name for name in features['partner']]       # perceived male features
pfemale_features = ["f" + name for name in features['individual']]  # perceived female features

# Column sets handed to the male and female aggregations.
male_features = [
    *features['non_aggregated'], *features['both'],
    *features['individual'], *pmale_features,
]
female_features = [
    *features['non_aggregated'], *features['both'],
    *pfemale_features, *features['partner'],
]

# Every column kept from the raw files, in output order.
total_features = [
    *features['non_aggregated'], *features['both'], *features['individual'],
    *pfemale_features, *pmale_features, *features['partner'],
    *measures,
]

# Restrict both samples to the shared columns, then stack them.
frames = [frame[total_features] for frame in (raw1, raw2)]
raw1, raw2 = frames
raw = pd.concat(frames)

# Dependent Measures

In [137]:
# Key columns first, then the outcome measures.
# Only keys not already present are prepended, so re-running this cell does
# not duplicate 'maleid'/'femaleid'/'dateorder' in `measures` (duplicate
# columns in `dependent_df` would break the later merges).
measures = features['non_aggregated'] + [
    column for column in measures if column not in features['non_aggregated']
]
dependent_df = raw[measures]

# Male aggregate

In [142]:
# Columns before this position are identifiers and are not averaged.
agg_index = len(features['non_aggregated'])

# Type 1: one row per date, holding the male's running averages
male_data = aggregated_data(ID='maleid', agg_index=agg_index, features=male_features, raw=raw)
male_df1 = pd.DataFrame(data=male_data, columns=male_features)

In [150]:
# Type 2: one row per male, holding his overall averages
male_data2 = aggregate_data2(ID='maleid', agg_index=agg_index, features=male_features, raw=raw)
# Output columns: the id followed by the aggregated feature names.
feature_cols = ["maleid"] + male_features[agg_index:]
male_df2 = pd.DataFrame(data=male_data2, columns=feature_cols)

# Female aggregate

In [151]:
# Type 1: one row per date, holding the female's running averages
female_data = aggregated_data(ID='femaleid', agg_index=agg_index, features=female_features, raw=raw)
female_df = pd.DataFrame(data=female_data, columns=female_features)

In [54]:
# Type 2: one row per female, holding her overall averages
female_data2 = aggregate_data2(ID='femaleid', agg_index=agg_index, features=female_features, raw=raw)
# Output columns: the id followed by the aggregated feature names.
feature_cols = ["femaleid"] + female_features[agg_index:]
female_df2 = pd.DataFrame(data=female_data2, columns=feature_cols)

# Join aggregates

In [63]:
# Pair the per-date male and female running averages, then attach the
# dependent measures for the same (male, female, date) triple.
pair_df1 = (
    male_df1
    .merge(female_df, on=["maleid", "femaleid", "dateorder"])
    .merge(dependent_df, on=["maleid", "femaleid", "dateorder"])
)

In [64]:
# Every (male, female, date) triple that occurs in the raw data.
intersect_df = raw[["maleid", "femaleid", "dateorder"]]

# Attach each person's overall averages to the dates they took part in,
# then add the dependent measures for that date.
pair_df2 = (
    intersect_df
    .merge(female_df2, on=["femaleid"])
    .merge(male_df2, on=["maleid"])
    .merge(dependent_df, on=["maleid", "femaleid", "dateorder"])
)

# Save data

In [65]:
# Write the running-average (type 1) pair table to the working directory, without the index column.
pair_df1.to_csv("./pair1Dataframe.csv", index = False)

In [66]:
# Write the overall-average (type 2) pair table to the working directory, without the index column.
pair_df2.to_csv("./pair2Dataframe.csv", index = False)