In [308]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler


In [294]:
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): this is global and also hides pandas/scikit-learn deprecation notices.
warnings.simplefilter('ignore')

In [326]:
# Load the three raw input tables from the working directory.
customers, products, transaction = (
    pd.read_csv(csv_path)
    for csv_path in ("Customers.csv", "Products.csv", "Transactions.csv")
)

In [328]:
 # Perform an outer join between Transactions and Customers on CustomerID
# Transactions joined to Customers; `indicator=True` adds a `_merge` column that
# records which side each row came from.
merged_data_outer_1 = transaction.merge(
    customers, how='outer', on='CustomerID', indicator=True
)
# Attach product attributes; Price appears in both inputs, so pandas suffixes it
# as Price_x / Price_y.
merged_data_outer = merged_data_outer_1.merge(products, how='outer', on='ProductID')
merged_data_outer.head(3)

Unnamed: 0,TransactionID,CustomerID,ProductID,TransactionDate,Quantity,TotalValue,Price_x,CustomerName,Region,SignupDate,_merge,ProductName,Category,Price_y
0,T00758,C0017,P001,2024-05-28 14:47:15,3.0,507.9,169.3,Jennifer King,Europe,2023-12-05,both,ActiveWear Biography,Books,169.3
1,T00088,C0019,P001,2024-01-30 17:23:03,2.0,338.6,169.3,Brandon Rodriguez,Europe,2023-01-12,both,ActiveWear Biography,Books,169.3
2,T00314,C0024,P001,2024-09-24 17:15:16,4.0,677.2,169.3,Michele Cooley,North America,2024-02-05,both,ActiveWear Biography,Books,169.3


In [331]:
# Keep only the columns used as model features (plus CustomerID); remove
# identifiers, names, dates, the duplicated price and the merge indicator.
final_df = merged_data_outer.drop(
    columns=[
        'TransactionID',
        'Price_y',
        'CustomerName',
        'ProductName',
        'TransactionDate',
        'SignupDate',
        '_merge',
    ]
)


In [333]:
# Summarise column dtypes and non-null counts to spot gaps left by the outer joins.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1001 entries, 0 to 1000
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CustomerID  1001 non-null   object 
 1   ProductID   1000 non-null   object 
 2   Quantity    1000 non-null   float64
 3   TotalValue  1000 non-null   float64
 4   Price_x     1000 non-null   float64
 5   Region      1001 non-null   object 
 6   Category    1000 non-null   object 
dtypes: float64(3), object(4)
memory usage: 54.9+ KB


In [335]:
# Count missing values per column.
final_df.isna().sum()

CustomerID    0
ProductID     1
Quantity      1
TotalValue    1
Price_x       1
Region        0
Category      1
dtype: int64

In [337]:
# Drop rows with any NaN, then rebuild a contiguous 0..n-1 index.
# Later cells build positional feature matrices from this frame, so index labels
# must coincide with row positions; without the reset, a dropped row in the
# middle of the frame would shift every subsequent lookup by one.
final_df = final_df.dropna().reset_index(drop=True)

In [339]:
# Quantity became float because of the NaN introduced by the outer join; cast it back to int.
final_df = final_df.astype({'Quantity': int})

In [341]:
# Display the cleaned frame (NaN rows dropped, Quantity cast to int).
final_df

Unnamed: 0,CustomerID,ProductID,Quantity,TotalValue,Price_x,Region,Category
0,C0017,P001,3,507.90,169.30,Europe,Books
1,C0019,P001,2,338.60,169.30,Europe,Books
2,C0024,P001,4,677.20,169.30,North America,Books
3,C0036,P001,2,338.60,169.30,North America,Books
4,C0045,P001,2,338.60,169.30,Asia,Books
...,...,...,...,...,...,...,...
995,C0077,P100,2,252.68,126.34,South America,Clothing
996,C0109,P100,4,505.36,126.34,North America,Clothing
997,C0109,P100,2,252.68,126.34,North America,Clothing
998,C0156,P100,4,505.36,126.34,North America,Clothing


In [395]:
# Numeric part of the CustomerID (e.g. 'C0017' -> 17).
# A raw string keeps `\d` from being treated as an (invalid) string escape, and
# expand=False returns a Series rather than a one-column DataFrame.
final_df['CustomerID_numeric'] = (
    final_df['CustomerID'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Filter customers with IDs from 1 to 50 (both bounds inclusive)
first_50_customers = final_df[final_df['CustomerID_numeric'].between(1, 50)]

first_50_customers

Unnamed: 0,CustomerID,ProductID,Quantity,TotalValue,Price_x,Region,Category,CustomerID_numeric
0,C0017,P001,3,507.90,169.30,Europe,Books,17
1,C0019,P001,2,338.60,169.30,Europe,Books,19
2,C0024,P001,4,677.20,169.30,North America,Books,24
3,C0036,P001,2,338.60,169.30,North America,Books,36
4,C0045,P001,2,338.60,169.30,Asia,Books,45
...,...,...,...,...,...,...,...,...
965,C0044,P096,1,307.47,307.47,Europe,Electronics,44
966,C0045,P096,3,922.41,307.47,Asia,Electronics,45
974,C0004,P097,3,958.02,319.34,South America,Books,4
983,C0041,P098,3,899.79,299.93,Europe,Electronics,41


In [397]:
# `first_50_customers` has one row per transaction, so a customer appears once
# per purchase. Keep each CustomerID once (first-seen order) so every customer
# is scored and written to the output a single time.
customer_list = first_50_customers['CustomerID'].drop_duplicates().to_list()
customer_list[0:10]

['C0017',
 'C0019',
 'C0024',
 'C0036',
 'C0045',
 'C0003',
 'C0031',
 'C0012',
 'C0027',
 'C0002']

In [393]:
# Preprocessing pipeline: standardise the numeric columns and one-hot encode the
# categorical ones. Columns not listed here are dropped by ColumnTransformer's default.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), ['Quantity', 'TotalValue', 'Price_x']),
    ('cat', OneHotEncoder(), ['Region', 'Category']),
])

# Fit on every row of final_df (CustomerID excluded) and build the feature matrix
X = preprocessor.fit_transform(final_df.drop('CustomerID', axis=1))

In [431]:
# Re-fit the preprocessor and rebuild the feature matrix used by the lookalike search
X = preprocessor.fit_transform(final_df.drop('CustomerID', axis='columns'))

def get_similar_users_as_csv(customer_ids, data, X, top_n=3, output_file='Lookalike.csv'):
    """
    Find the top N most similar customers for each requested customer and save them as a CSV.

    The query vector for a customer is the feature row of that customer's first
    occurrence in `data`. Candidates are ranked by cosine similarity against every
    row of `X`; rows belonging to the query customer are skipped and each other
    customer is reported at most once, with the best score among their rows.

    Parameters:
    customer_ids (list): List of CustomerIDs to find similar users for.
        IDs that do not occur in `data` are skipped.
    data (DataFrame): DataFrame with a 'CustomerID' column; row i of `data`
        must correspond to row i of `X` (by position, not by index label).
    X (array-like): Feature matrix used for similarity calculation.
    top_n (int): Number of similar users to retrieve.
    output_file (str): Name of the output CSV file.

    Returns:
    DataFrame: A DataFrame with columns 'CustomerID' and 'Similarities', where
    'Similarities' is a list of (CustomerID, score) tuples sorted by descending score.
    """
    # Work with row positions rather than index labels: X is purely positional,
    # while `data` may carry a non-contiguous index (for example after dropna()).
    row_customer_ids = data['CustomerID'].to_numpy()
    results = []

    for customer_id in customer_ids:
        matching_positions = np.flatnonzero(row_customer_ids == customer_id)
        if matching_positions.size == 0:
            # Customer ID is not present in the data: skip it
            continue
        query_position = matching_positions[0]

        # Similarity of the query row against every row of X
        similarity_scores = cosine_similarity(
            X[query_position:query_position + 1], X
        )[0]

        # Walk candidates from most to least similar. A stable sort keeps ties
        # in row order so the output is deterministic.
        similar_list = []
        seen_customers = {customer_id}  # never report the customer as their own lookalike
        for position in np.argsort(-similarity_scores, kind='stable'):
            if len(similar_list) >= top_n:
                break
            candidate_id = row_customer_ids[position]
            if candidate_id in seen_customers:
                continue
            seen_customers.add(candidate_id)
            # Plain float keeps the CSV free of NumPy scalar reprs
            similar_list.append((candidate_id, float(similarity_scores[position])))

        results.append({
            'CustomerID': customer_id,
            'Similarities': similar_list
        })

    # Convert the results list into a DataFrame
    results_df = pd.DataFrame(results, columns=['CustomerID', 'Similarities'])

    # Save to CSV
    results_df.to_csv(output_file, index=False)

    print(f"Results saved to {output_file}")

    return results_df

# Example usage: top-3 lookalikes for every selected customer, written to the default CSV
custdf = get_similar_users_as_csv(
    customer_ids=customer_list,
    data=final_df,
    X=X,
    top_n=3,
)

Results saved to Lookalike.csv


# ######################

In [423]:
# Preprocessing function
def preprocess_data(final_df):
    """
    Build one feature row per customer from transaction-level data.

    Categorical columns (Region, Category, ProductID) are one-hot encoded and
    numeric columns (Quantity, TotalValue, Price_x) are min-max scaled; the
    resulting rows are then averaged per CustomerID.

    Parameters:
    final_df (DataFrame): Transaction-level frame containing 'CustomerID' and
        the categorical and numeric columns listed above.

    Returns:
    DataFrame: Indexed by CustomerID, one column per scaled numeric feature and
    per one-hot category (each holding the mean over the customer's rows).
    """
    categorical_columns = ['Region', 'Category', 'ProductID']
    numeric_columns = ['Quantity', 'TotalValue', 'Price_x']

    # One-hot encode categorical variables.
    # The encoded frame reuses final_df's index so that the concat below lines
    # rows up correctly even when that index is not 0..n-1 (e.g. after dropna()).
    enc = OneHotEncoder()
    encoded = enc.fit_transform(final_df[categorical_columns]).toarray()
    encoded_df = pd.DataFrame(
        encoded,
        columns=enc.get_feature_names_out(categorical_columns),
        index=final_df.index,
    )

    # Scale numerical features
    scaler = MinMaxScaler()
    scaled_numeric = scaler.fit_transform(final_df[numeric_columns])
    scaled_df = pd.DataFrame(scaled_numeric, columns=numeric_columns, index=final_df.index)

    # Aggregate features by CustomerID
    features = pd.concat([final_df[['CustomerID']], scaled_df, encoded_df], axis=1)
    aggregated_features = features.groupby('CustomerID').mean()

    return aggregated_features

# Preprocess data: build one aggregated feature row per CustomerID
customer_features = preprocess_data(final_df)

# Recommendation function
def recommend_customers(customer_features, customer_id, top_n=3):
    """
    Return the `top_n` customers most similar to `customer_id`.

    Parameters:
    customer_features (DataFrame): One feature row per customer, indexed by CustomerID.
    customer_id: Index label of the customer to find lookalikes for
        (a missing label raises KeyError from `.loc`).
    top_n (int): Number of recommendations to return.

    Returns:
    DataFrame: Columns 'CustomerID' and 'SimilarityScore', sorted by descending
    score, with the input customer removed.
    """
    # Feature vector of the query customer as a single-row 2-D array
    query_vector = customer_features.loc[customer_id].values.reshape(1, -1)

    # Cosine similarity of the query against every customer
    scores = cosine_similarity(query_vector, customer_features)[0]

    # Rank all customers from most to least similar
    ranked = pd.DataFrame({
        'CustomerID': customer_features.index,
        'SimilarityScore': scores
    })
    ranked = ranked.sort_values(by='SimilarityScore', ascending=False)

    # Remove the query customer, then keep the best `top_n`
    is_other_customer = ranked['CustomerID'] != customer_id
    return ranked[is_other_customer].head(top_n)

# Example: Recommend customers similar to 'C0001'
similar_customers = recommend_customers(customer_features, customer_id='C0001', top_n=3)
print(similar_customers)

    CustomerID  SimilarityScore
188      C0190         0.919676
179      C0181         0.902942
101      C0102         0.902496


### `Input:`
Accepts a list of CustomerIDs (`customer_ids`), the source DataFrame (`data`), and a feature matrix (`X`) with one row for each row of `data`.

### `Similarity Range:`
Cosine similarity values range from -1 (opposite direction) to 1 (same direction). Scores are ranked in descending order, so the customers with the highest similarity are chosen first; the ranking by itself does not guarantee that the selected scores are positive.
### `Top Similar Users:`
 Retrieves indices of the top N most similar customers (excluding the target itself) and forms a list of tuples with SimilarCustomerID and their similarity scores.

### `Output: `
Consolidates results into a DataFrame with CustomerID and their Similarities. Saves this as a CSV and returns the DataFrame.