In [11]:
import os
import pandas as pd
import numpy as np

# Specify the folder containing the Excel files
folder_path = './4 - Processed Data Files'

# Get a list of all files in the folder
file_list = os.listdir(folder_path)

# Filter only Excel files
excel_files = [file for file in file_list if file.endswith('.xlsx')]

# Sort the file list to ensure consistent ordering
excel_files.sort()

# Iterate through the Excel files and create DataFrames
dataframes = {}
for i, file in enumerate(excel_files, start=1):
    df_name = f'df{i}'
    file_path = os.path.join(folder_path, file)
    dataframes[df_name] = pd.read_excel(file_path)

merged_df = pd.concat(dataframes.values(), ignore_index=True)

In [58]:
# # Assigning DataFrames to variables
# df1 = dataframes['df1']
# df2 = dataframes['df2']
# df3 = dataframes['df3']
# df4 = dataframes['df4']
# df5 = dataframes['df5']
# df6 = dataframes['df6']
# df7 = dataframes['df7']
# df8 = dataframes['df8']
# df9 = dataframes['df9']
# df10 = dataframes['df10']
# df11 = dataframes['df11']
# df12 = dataframes['df12']
# df13 = dataframes['df13']
# df14 = dataframes['df14']
# df15 = dataframes['df15']

# # Concatenate DataFrames vertically
# merged_df = pd.concat([df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df11, df12, df13, df14, df15], ignore_index=True)


In [13]:
# Step 1: Drop columns where all values are "NA"
merged_df = merged_df.replace("NA", np.nan)  # Convert "NA" strings to actual NaN values
merged_df = merged_df.dropna(axis=1, how='all')  # Drop columns with all NaN values

# For remaining 'NA' values not in fully 'NA' columns, replace with 0
merged_df = merged_df.replace(np.nan, 0)

In [62]:
# Convert the corrected dictionary to a DataFrame
mock_df_corrected = merged_df

# Convert timestamps to datetime objects
mock_df_corrected['timeStamp'] = pd.to_datetime(mock_df_corrected['timeStamp'])

# Extract time-based features
mock_df_corrected['minute'] = mock_df_corrected['timeStamp'].dt.minute
mock_df_corrected['hour'] = mock_df_corrected['timeStamp'].dt.hour
mock_df_corrected['weekday'] = mock_df_corrected['timeStamp'].dt.weekday  # Monday=0, Sunday=6
mock_df_corrected['day_of_month'] = mock_df_corrected['timeStamp'].dt.day

# Now, you can drop the 'timeStamp' column to avoid dtype issues in TPOT
mock_df_corrected.drop(['timeStamp'], axis=1, inplace=True)

In [14]:
merged_df["customer_email"].unique()

array(['benedict.s@hotmail.co.uk', 'ellenmcarthur97@gmail.com',
       'oms42deadman@gmail.com', 'shutko.vladyslav@gmail.com'],
      dtype=object)

In [63]:
friendship_info = {
   "benedict.s@hotmai.co.uk": ["ellenmcarthur97@gmail.com'", "oms42deadman@gmail.com", "shutko.vladyslav@gmail.com"],
   "ellenmcarthur97@gmail.com": ["benedict.s@hotmail.co.uk"],
   "oms42deadman@gmail.com": ["benedict.s@hotmail.co.uk", "shutko.vladyslav@gmail.com"],
   "shutko.vladyslav@gmail.com": ["benedict.s@hotmail.co.uk", "oms42deadman@gmail.com"],   
}

# Extract unique email addresses (individuals) from the mock dataset
individuals = mock_df_corrected['customer_email'].unique()

# Initialize a DataFrame with zeros, using individuals as both rows and columns
friendship_matrix = pd.DataFrame(0, index=individuals, columns=individuals)

# Populate the matrix based on the friendship_info dictionary
for person, friends in friendship_info.items():
    for friend in friends:
        friendship_matrix.loc[person, friend] = 1
        friendship_matrix.loc[friend, person] = 1  # Friendship is mutual


In [64]:
# Assuming mock_df_corrected is our transaction dataset
features = mock_df_corrected.drop(['customer_email'], axis=1)  # Excluding email for feature encoding
features_encoded = pd.get_dummies(features)

In [65]:
# Assuming `features_encoded` is your encoded transaction data
# Let's first ensure the customer email is included for aggregation
mock_df_corrected['customer_email'] = mock_df_corrected['customer_email'].astype('category')
features_with_email = pd.concat([mock_df_corrected['customer_email'], features_encoded], axis=1)

# Aggregate features by customer email
individual_features = features_with_email.groupby('customer_email').mean()

# Ensure the index is consistent for later operations
individual_features = individual_features.reindex(friendship_matrix.index)

  individual_features = features_with_email.groupby('customer_email').mean()


In [66]:
X_train_list = []
y_train_list = []

for i, email_i in enumerate(friendship_matrix.index):
    for j, email_j in enumerate(friendship_matrix.columns):
        if i < j:  # Avoid duplicate pairs and self-pairing
            # Combine features of both individuals in the pair
            features_pair = pd.concat([individual_features.loc[email_i], individual_features.loc[email_j]], axis=0).to_list()
            X_train_list.append(features_pair)
            # Corresponding friendship status
            y_train_list.append(friendship_matrix.loc[email_i, email_j])

# Convert lists to DataFrame and Series for X_train and y_train
X_train = pd.DataFrame(X_train_list)
y_train = pd.Series(y_train_list)


In [69]:
from tpot import TPOTClassifier
import sklearn.model_selection

# Assuming your X_train and y_train are correctly prepared
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X_train, y_train, 
                                                                            test_size=0.25, random_state=42)

# Initialize TPOT classifier with a mix of specified and default parameters
tpot = TPOTClassifier(
    generations=10,  # Custom: 10 generations for the optimization process.
    population_size=100,  # Custom: 100 individuals in the population per generation.
    verbosity=2,  # Custom: Show more detailed progress and pipeline information.
    random_state=42,  # Custom: Ensure reproducibility of your results.
    scoring=None,  # Default: Use TPOT's default scoring method (accuracy for classification).
    cv=None,  # Default: TPOT will use a 5-fold cross-validation.
    max_time_mins=None,  # Default: No maximum time limit on the optimization process.
    early_stop=None,  # Default: No early stopping. Runs for the full number of generations.
    config_dict=None  # Default: TPOT will use its default configuration dictionary.
)

# Fit the TPOT model to your data
tpot.fit(X_train, y_train)

# Evaluate the model on the test set
print(f\nModel accruacy score: {tpot.score(X_test, y_test)}%")

# Optionally, export the pipeline to a Python file
# tpot.export('tpot_best_pipeline.py')


                                                                                
Generation 1 - Current best internal CV score: 0.8803030303030303
                                                                                 
Generation 2 - Current best internal CV score: 0.8803030303030303
                                                                               
Generation 3 - Current best internal CV score: 0.8803030303030303
                                                                               
Generation 4 - Current best internal CV score: 0.8803030303030303
                                                                               
Generation 5 - Current best internal CV score: 0.8803030303030303
                                                                               
Generation 6 - Current best internal CV score: 0.8803030303030303
                                                                               
Generation 7 - Current best internal CV s