# In [2]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
import plotly.express as px


# Mean-impute the missing values in the listed numeric columns, then cast each
# column to an integer dtype.
# NOTE(review): Transaction_ID, Customer_ID, Phone and Zipcode look like
# identifiers; a mean-imputed value is not a real ID — confirm this is intended.
for col in ['Transaction_ID', 'Customer_ID', 'Phone','Zipcode' ,'Age','Year' ,'Ratings']:
    if data[col].isna().any():
        # Mean of the column, ignoring NaN values
        col_mean = data[col].mean(skipna=True)

        # Assign the filled column back instead of calling
        # fillna(..., inplace=True) on the selection: the in-place form acts
        # on a temporary object and does not update `data` under pandas
        # Copy-on-Write, which would leave NaN behind for astype(int) below.
        data[col] = data[col].fillna(col_mean)

    # Converting the column to integer dtype (truncates any fractional mean)
    data[col] = data[col].astype(int)

# Remaining null count per column, as a sanity check
print(data.isnull().sum())



# Re-render the Date and Time columns as uniformly formatted strings
# (YYYY-MM-DD and HH:MM:SS respectively).
for column_name, output_format in (('Date', '%Y-%m-%d'), ('Time', '%H:%M:%S')):
    parsed_values = pd.to_datetime(data[column_name])
    data[column_name] = parsed_values.dt.strftime(output_format)

# Remove fully duplicated rows in place, keeping the first occurrence.
data.drop_duplicates(keep='first', inplace=True)

# Collect the rows in which at least one of the three monetary/quantity
# columns is zero or negative, and print them for inspection.
filtered_data = data[
    data[['Total_Purchases', 'Amount', 'Total_Amount']].le(0).any(axis=1)
]

print(filtered_data)


# Fill gaps in Total_Purchases and Amount by linear interpolation.
# The keyword is `limit_direction`, not `direction`: the latter is not an
# interpolate() parameter and is silently ignored for the default linear
# method, which leaves NaNs at the start of the column unfilled.
# `limit_direction='both'` fills leading and trailing gaps as well.
data['Total_Purchases'] = data['Total_Purchases'].interpolate(limit_direction='both')
data['Amount'] = data['Amount'].interpolate(limit_direction='both')

# Recompute Total_Amount so it is consistent with the (now gap-free) inputs.
data['Total_Amount'] = data['Total_Purchases'] * data['Amount']

# Fill missing values in the text (object-dtype) columns with each column's
# mode, i.e. its most frequent value. Example: for the values
# red, blue, green, red the mode is "red", so every NaN becomes "red".
categorical_cols = data.select_dtypes(include=['object']).columns
# Second name for the same column list, kept because later code may use it.
cat_columns = categorical_cols

for col in categorical_cols:
    col_modes = data[col].mode()
    # mode() is empty when the column holds no non-null value; there is
    # nothing to fill with in that case, so the column is left untouched
    # instead of raising on the missing first element.
    if col_modes.empty:
        continue
    # Assign back rather than fillna(..., inplace=True) on the selection,
    # which does not update `data` under pandas Copy-on-Write.
    data[col] = data[col].fillna(col_modes.iloc[0])


from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# The two columns used for segmentation, restricted to rows where both are
# present.
segmentation_features = data.loc[:, ['Total_Purchases', 'Total_Amount']].dropna(how='any')

# Standardise the features: learn the scaling parameters first, then apply
# them to the same rows.
scaler = StandardScaler()
scaler.fit(segmentation_features)
scaled_features = scaler.transform(segmentation_features)

# Elbow method: fit KMeans for k = 1..10 and record each model's inertia so
# the curve can be inspected for a bend.
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(scaled_features)
    inertia.append(kmeans.inertia_)

# Plot inertia against the number of clusters.
plt.plot(range(1, 11), inertia, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()



# Final segmentation into three clusters.
kmeans = KMeans(n_clusters=3, random_state=42)
raw_labels = kmeans.fit_predict(scaled_features)

# KMeans numbers its clusters arbitrarily, but the rest of this script treats
# 0 / 1 / 2 as low / mid / high value. Renumber the clusters by ascending
# Total_Amount centroid so that assumption actually holds. (Standardisation
# is monotonic, so ordering the scaled centroids orders the raw ones too.)
# After this, data['Cluster'] uses the renumbered ids, which no longer match
# the row order of kmeans.cluster_centers_.
amount_position = segmentation_features.columns.get_loc('Total_Amount')
centroid_order = np.argsort(kmeans.cluster_centers_[:, amount_position])
value_rank = np.empty_like(centroid_order)
value_rank[centroid_order] = np.arange(centroid_order.size)

# Align on the index of the rows that were actually clustered: a plain array
# assignment raises a length-mismatch error whenever dropna() removed a row
# from segmentation_features. Rows that were not clustered get NaN.
data['Cluster'] = pd.Series(value_rank[raw_labels], index=segmentation_features.index)


# Scatter plot of purchases against amount, one colour per cluster.
segmentation_axes = sns.scatterplot(
    data=data,
    x='Total_Purchases',
    y='Total_Amount',
    hue='Cluster',
    palette='viridis',
)
segmentation_axes.set_title('Customer Segmentation')
segmentation_axes.set_xlabel('Total Purchases')
segmentation_axes.set_ylabel('Total Amount')
plt.show()



# Per-cluster summary statistics, built with named aggregation so each output
# column is labelled where it is computed.
# Customer_Count is the number of non-null Total_Purchases rows in the
# cluster (rows, not necessarily distinct customers).
cluster_summary = (
    data.groupby('Cluster')
    .agg(
        Avg_Purchases=('Total_Purchases', 'mean'),
        Customer_Count=('Total_Purchases', 'count'),
        Total_Purchases=('Total_Purchases', 'sum'),
        Avg_Amount=('Total_Amount', 'mean'),
        Total_Amount=('Total_Amount', 'sum'),
    )
    .reset_index()
)

print("Cluster Summary:")
print(cluster_summary)


# Rows labelled cluster 2, treated here as the high-value segment.
# NOTE(review): this relies on cluster 2 being the highest-spending group —
# verify against the printed cluster summary.
high_value_customers = data.loc[data['Cluster'].eq(2)]

# Write the segment to disk for the marketing team.
high_value_customers.to_csv(path_or_buf='high_value_customers.csv', index=False)

print(f"Number of High-Value Customers: {high_value_customers.shape[0]}")


# Rows labelled cluster 0, treated here as the low-value segment.
# NOTE(review): likewise relies on cluster 0 being the lowest-spending group.
low_value_customers = data.loc[data['Cluster'].eq(0)]

# Write the segment to disk for re-engagement campaigns.
low_value_customers.to_csv(path_or_buf='low_value_customers.csv', index=False)

print(f"Number of Low-Value Customers: {low_value_customers.shape[0]}")




# Count rows per (cluster, product category) pair, then pivot the categories
# into columns; combinations that never occur become 0.
category_counts = data.groupby(['Cluster', 'Product_Category']).size()
product_preferences = category_counts.unstack().fillna(0)

print("Product Preferences by Cluster:")
print(product_preferences)




# Number of rows per cluster for each value of the 'Year' column, with years
# as rows and clusters as columns (missing combinations shown as 0).
rows_per_year_and_cluster = data.groupby(['Year', 'Cluster']).size()
cluster_trends = rows_per_year_and_cluster.unstack(fill_value=0)

print("Cluster Trends Over Time:")
print(cluster_trends)

# Stacked bar chart: one bar per year, split by cluster.
cluster_trends.plot.bar(stacked=True, figsize=(10, 6))
plt.title("Cluster Distribution Over Time")
plt.xlabel("Year")
plt.ylabel("Number of Customers")
plt.show()


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Train a classifier that reproduces the cluster label from the same two
# columns the clustering was built on.
features = data.loc[:, ['Total_Purchases', 'Total_Amount']]
target = data.loc[:, 'Cluster']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Fit a random forest on the training portion.
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Accuracy on the held-out rows, reported as a percentage.
accuracy = clf.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")


# Three made-up customers used to demonstrate prediction with the classifier.
new_customer_data = pd.DataFrame(
    [[10, 1200], [25, 4000], [5, 500]],
    columns=['Total_Purchases', 'Total_Amount'],
)

# Select exactly the columns the classifier was trained on, in the same order.
predict_features = new_customer_data.loc[:, ['Total_Purchases', 'Total_Amount']]

# Predict a cluster label for every row.
predicted_clusters = clf.predict(predict_features)

# Attach the predictions as a new column and show the result.
new_customer_data['Predicted_Cluster'] = predicted_clusters
print(new_customer_data)


import pickle


# Persist the trained classifier to disk and read it back.
file_name = 'customer_segmentation_model.pkl'

# Context managers guarantee the file handles are flushed and closed; the
# bare open(...) calls used previously left them open.
with open(file_name, 'wb') as model_file:
    pickle.dump(clf, model_file)

# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here: the file written just above).
with open(file_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)



# Sample inputs for checking the model that was loaded back from disk.
input_data = pd.DataFrame({
    'Total_Purchases': [10, 25, 5],
    'Total_Amount': [1200, 4000, 500]
})

# Predict every row in a single call and pass the DataFrame itself, so the
# column names seen at prediction time match those used during training.
# (Converting each row to a bare NumPy array drops the feature names and
# costs one predict() call per row.)
predictions = list(loaded_model.predict(input_data))

print(predictions)  # Print all predictions

# Human-readable name for each cluster id; any id other than 0 or 1 is
# reported as high value.
# NOTE(review): assumes cluster ids are ordered low -> high value — confirm
# against the cluster summary.
segment_names = {0: 'Low Value Customer', 1: 'Mid Value Customer'}
for prediction in predictions:
    print(segment_names.get(prediction, 'High Value Customer'))


import joblib

# Persist the trained classifier (`clf`) with joblib. The target path is the
# same one the earlier pickle step writes to, so that file is overwritten.
joblib.dump(value=clf, filename='customer_segmentation_model.pkl')

print("Model saved successfully!")


from flask import Flask, request, jsonify
import pandas as pd
import joblib

# Create the Flask application object.
app = Flask(import_name=__name__)

# Read the persisted model back from disk once, at import time.
model = joblib.load(filename='customer_segmentation_model.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    """Predict the cluster for the customers described in a JSON request body.

    The body must be a JSON object containing the keys ``Total_Purchases``
    and ``Total_Amount``. Each value may be a list (one entry per customer)
    or a single number (a one-customer request). Any additional keys are
    echoed back in the response unchanged.

    Returns:
        On success, a JSON list with one record per customer holding the
        submitted fields plus ``Predicted_Cluster``. On failure, a JSON
        object ``{'error': ...}`` with status 400 when the request itself is
        malformed, or 500 when prediction fails.
    """
    required_keys = ['Total_Purchases', 'Total_Amount']

    # silent=True returns None for a missing or invalid JSON body instead of
    # raising, so a bad request is reported as 400 rather than as a 500.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        return jsonify({'error': 'Request body must be a JSON object'}), 400

    # Input validation (check if required keys are present)
    if not all(key in payload for key in required_keys):
        return jsonify({'error': 'Missing required keys'}), 400

    try:
        if any(isinstance(value, (list, dict)) for value in payload.values()):
            # Column-oriented payload: build the frame from it directly.
            input_data = pd.DataFrame(payload)
        else:
            # All values are scalars: treat the object as a single row, which
            # DataFrame(payload) cannot do without an explicit index.
            input_data = pd.DataFrame([payload])
    except ValueError as e:
        # e.g. lists of different lengths — a client error, not a server one.
        return jsonify({'error': str(e)}), 400

    try:
        # Ensure only the same features used in training are passed on.
        features = input_data[required_keys]

        # Use the model loaded from disk rather than the training-time `clf`
        # global, which does not exist when this app runs on its own.
        predictions = model.predict(features)

        # Add predictions to the response
        input_data['Predicted_Cluster'] = predictions
        response = input_data.to_dict(orient='records')

        return jsonify(response)
    except Exception as e:
        # Top-level boundary of the request handler: report instead of crash.
        return jsonify({'error': str(e)}), 500

# Start Flask's built-in server only when this file is executed directly as a
# script, not when it is imported.
if __name__ == '__main__':
    # NOTE(review): debug=True turns on Flask's interactive debugger and
    # auto-reloader, which is meant for local development only — confirm it
    # is disabled before exposing this service.
    app.run(debug=True)



