In [21]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import numpy as np

customers_df=pd.read_csv('/content/Customers.csv')
products_df =pd.read_csv('/content/Products.csv')
products_df.isnull().sum()


Unnamed: 0,0
ProductID,0
ProductName,0
Category,0
Price,0


In [14]:
product_categories = products_df['Category'].value_counts()
product_categories

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
Books,26
Electronics,26
Clothing,25
Home Decor,23


In [16]:
product_categories = products_df['Category'].unique()
interest_matrix = pd.DataFrame(columns=product_categories)
interest_matrix

Unnamed: 0,Books,Electronics,Home Decor,Clothing


In [26]:

def preprocess_data(customers_df):

    customers_df['Region'] = customers_df['Region'].factorize()[0]


    customers_df['SignupDate'] = pd.to_datetime(customers_df['SignupDate'])
    customers_df['TimeSinceSignup'] = (pd.to_datetime('today') - customers_df['SignupDate']).dt.days


    scaler = StandardScaler()
    customers_df[['Region', 'TimeSinceSignup']] = scaler.fit_transform(customers_df[['Region', 'TimeSinceSignup']])

    return customers_df

customers_df = preprocess_data(customers_df)


customers_df['TotalSpent'] = np.random.uniform(100, 1000, len(customers_df))  # Random total spent
customers_df['PurchaseFrequency'] = np.random.randint(1, 20, len(customers_df))  # Random frequency of purchases


customers_df[['TotalSpent', 'PurchaseFrequency']] = scaler.fit_transform(customers_df[['TotalSpent', 'PurchaseFrequency']])


def combine_features(customers_df):
    feature_columns = ['Region', 'TimeSinceSignup', 'TotalSpent', 'PurchaseFrequency']
    feature_matrix = customers_df[feature_columns].values
    return feature_matrix

feature_matrix = combine_features(customers_df)


def calculate_similarity(feature_matrix):
    similarity_matrix = cosine_similarity(feature_matrix)
    return similarity_matrix

similarity_matrix = calculate_similarity(feature_matrix)


def get_top_lookalikes(similarity_matrix, customers_df, top_n=3):
    lookalikes = {}

    for i in range(len(similarity_matrix)):

        similarity_scores = similarity_matrix[i]
        similar_customer_indices = np.argsort(similarity_scores)[::-1][1:top_n+1]


        customer_id = customers_df.iloc[i]['CustomerID']
        lookalikes[customer_id] = [(customers_df.iloc[j]['CustomerID'], similarity_scores[j]) for j in similar_customer_indices]

    return lookalikes

lookalikes = get_top_lookalikes(similarity_matrix, customers_df)


lookalike_data = []
for cust_id, similar_customers in lookalikes.items():
    for similar_cust, score in similar_customers:
        lookalike_data.append({'cust_id': cust_id, 'similar_cust_id': similar_cust, 'similarity_score': score})

lookalike_df = pd.DataFrame(lookalike_data)

# Save to CSV
lookalike_df.to_csv("Lookalike.csv", index=False)

# Show the first 20 customers' lookalikes (for evaluation)
print(lookalike_df.head(60))

   cust_id similar_cust_id  similarity_score
0    C0001           C0153          0.988496
1    C0001           C0083          0.985177
2    C0001           C0108          0.955694
3    C0002           C0162          0.999636
4    C0002           C0040          0.973985
5    C0002           C0104          0.885848
6    C0003           C0032          0.988283
7    C0003           C0087          0.907049
8    C0003           C0142          0.882977
9    C0004           C0130          0.968899
10   C0004           C0113          0.961335
11   C0004           C0027          0.938115
12   C0005           C0113          0.928214
13   C0005           C0025          0.911152
14   C0005           C0184          0.875813
15   C0006           C0126          0.953631
16   C0006           C0163          0.929787
17   C0006           C0052          0.901274
18   C0007           C0159          0.964957
19   C0007           C0161          0.953180
20   C0007           C0118          0.927513
21   C0008