<a href="https://colab.research.google.com/github/00abhinav-u/Data-Science-Project/blob/main/Online_Retail_Recommendation_ipynb_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install pandas numpy scipy scikit-learn matplotlib seaborn

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_preprocess_data(file_path="/content/retail_data.csv.xlsx"):
    """Load the retail transactions workbook and clean it for modelling.

    Rows without a ``CustomerID`` and rows whose ``Quantity`` is not
    positive are dropped, ``CustomerID`` is cast to ``int``, a
    ``TotalPrice`` column (``Quantity * UnitPrice``) is added and
    ``InvoiceDate`` is parsed into datetimes.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The cleaned ``DataFrame``, or ``None`` when the file is missing or
        any loading/cleaning step fails (the error is printed).
    """
    try:
        raw = pd.read_excel(file_path)
        keep = raw['CustomerID'].notna() & (raw['Quantity'] > 0)
        # Take an explicit copy of the filtered rows so the column
        # assignments below write to an independent frame instead of a
        # possible view of ``raw`` (avoids SettingWithCopyWarning).
        df = raw.loc[keep].copy()
        df['CustomerID'] = df['CustomerID'].astype(int)
        df['TotalPrice'] = df['Quantity'] * df['UnitPrice']
        df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        # Best-effort loader: callers rely on ``None`` to signal any
        # failure (missing columns, unreadable workbook, bad dates, ...).
        print(f"Error loading data: {str(e)}")
        return None

def create_user_item_matrix(df):
    """Pivot transactions into a customer-by-product quantity table.

    Args:
        df: Transactions with ``CustomerID``, ``StockCode`` and
            ``Quantity`` columns.

    Returns:
        A ``(sparse_matrix, user_item_matrix)`` tuple: the pivoted
        ``DataFrame`` (rows are customers, columns are stock codes, cells
        are summed quantities with 0 for missing pairs) and a CSR matrix
        built from its values.
    """
    dense_table = df.pivot_table(
        values='Quantity',
        index='CustomerID',
        columns='StockCode',
        aggfunc='sum',
        fill_value=0,
    )
    return csr_matrix(dense_table.values), dense_table

def calculate_similarity(sparse_matrix):
    """Return the pairwise cosine similarity between matrix columns.

    The input is transposed first, so the similarity is computed between
    the columns of ``sparse_matrix`` rather than between its rows.
    """
    transposed = sparse_matrix.T
    return cosine_similarity(transposed)

def get_recommendations(customer_id, user_item_matrix, item_similarity, n_recommendations=5):
    """Rank the items a customer has not bought by similarity to their purchases.

    Each candidate item (quantity exactly 0 for this customer) is scored
    with the mean of its similarity to every item the customer bought
    (quantity > 0).

    Args:
        customer_id: Row label of the customer in ``user_item_matrix``.
        user_item_matrix: Customer-by-item quantity table.
        item_similarity: Square item-item similarity array whose row/column
            order matches ``user_item_matrix.columns``.
        n_recommendations: Maximum number of items to return.

    Returns:
        A float ``Series`` indexed by item label, sorted by descending
        score and truncated to ``n_recommendations`` entries. All scores
        are 0.0 when the customer has no purchased items.

    Raises:
        KeyError: If ``customer_id`` is not a row of ``user_item_matrix``.
    """
    customer_purchases = user_item_matrix.loc[customer_id]
    quantities = customer_purchases.to_numpy()

    # Work with column positions so the similarity matrix can be sliced
    # in one step instead of one ``get_loc`` lookup per item pair.
    purchased_pos = np.flatnonzero(quantities > 0)
    unpurchased_pos = np.flatnonzero(quantities == 0)
    candidates = user_item_matrix.columns[unpurchased_pos]

    if purchased_pos.size and unpurchased_pos.size:
        similarity = np.asarray(item_similarity)
        block = similarity[np.ix_(unpurchased_pos, purchased_pos)]
        values = block.mean(axis=1)
    else:
        # Nothing to average over: keep the neutral score of zero rather
        # than producing NaN from an empty mean.
        values = np.zeros(len(candidates), dtype=float)

    # Float dtype from the start; the scores are means of similarities.
    scores = pd.Series(values, index=candidates, dtype=float)
    return scores.sort_values(ascending=False).head(n_recommendations)

def plot_purchase_distribution(df):
    """Show a 50-bin histogram of the ``Quantity`` column of ``df``."""
    plt.figure(figsize=(10, 6))
    # seaborn draws on the current axes and hands them back, so the
    # labels can be set on that object directly.
    axes = sns.histplot(data=df, x='Quantity', bins=50)
    axes.set_title('Distribution of Purchase Quantities')
    axes.set_xlabel('Quantity')
    axes.set_ylabel('Count')
    plt.show()

def plot_top_products(df):
    """Show a bar chart of the ten descriptions with the largest total quantity."""
    quantity_by_product = df.groupby('Description')['Quantity'].sum()
    ranked = quantity_by_product.sort_values(ascending=False)
    top_products = ranked.head(10)

    plt.figure(figsize=(12, 6))
    # Totals on x, descriptions on y; seaborn returns the axes it drew on.
    axes = sns.barplot(x=top_products.values, y=top_products.index)
    axes.set_title('Top 10 Most Purchased Products')
    axes.set_xlabel('Total Quantity')
    axes.set_ylabel('Product Description')
    plt.show()

def main(file_path="/content/retail_data.csv.xlsx"):
    """Run the full demo: load data, build the model, print and plot results.

    Loads and cleans the workbook, prints a summary, builds the user-item
    matrix and item similarities, prints recommendations for the customer
    of the first transaction row, and finally shows the two plots.

    Args:
        file_path: Path of the Excel workbook to load.
    """
    print("Loading and preprocessing data...")
    df = load_and_preprocess_data(file_path)
    if df is None:
        return
    if df.empty:
        # Without any row there is no sample customer to pick below.
        print("No transactions left after preprocessing; nothing to recommend.")
        return

    print("\nDataset Info:")
    # ``DataFrame.info`` prints its report itself and returns None, so
    # wrapping it in ``print`` would emit a stray "None" line.
    df.info()
    print("\nDataset Shape:", df.shape)
    print("\nSample Data:")
    print(df.head())

    print("\nCreating user-item matrix...")
    sparse_matrix, user_item_matrix = create_user_item_matrix(df)
    print("Calculating item similarities...")
    item_similarity = calculate_similarity(sparse_matrix)

    sample_customer = df['CustomerID'].iloc[0]
    print(f"\nGenerating recommendations for Customer {sample_customer}...")
    recommendations = get_recommendations(sample_customer, user_item_matrix, item_similarity)

    # Keep exactly one description per stock code (the first non-missing
    # one). De-duplicating on the (code, description) pair would leave
    # several rows for a code that appears with different descriptions,
    # and the lookup would then yield a Series instead of a string.
    described = df[['StockCode', 'Description']].dropna(subset=['Description'])
    described = described.drop_duplicates(subset='StockCode')
    stock_to_desc = dict(zip(described['StockCode'], described['Description']))
    # Fall back to the raw stock code when no description is known.
    recommended_products = [stock_to_desc.get(code, code) for code in recommendations.index]

    print("\nTop 5 Recommended Products:")
    for i, (product, score) in enumerate(zip(recommended_products, recommendations.values), 1):
        print(f"{i}. {product} (Similarity Score: {score:.4f})")

    print("\nGenerating visualizations...")
    plot_purchase_distribution(df)
    plot_top_products(df)

# Run the demo pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()