# 📈 Project 2: Sales Performance Analysis
**Objective**: Analyze sales data to identify trends, relationships, and factors affecting sales performance.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Notebook-wide plotting configuration: use seaborn's 'whitegrid' style for
# the figures produced by the cells below.
sns.set(style='whitegrid')

## Step 1: Load and Explore the Dataset

In [None]:
# Read the raw sales records; the path is relative to the notebook's
# working directory.
df = pd.read_csv('sales_data.csv')

# Structural overview: (rows, columns) first, then per-column dtypes and
# non-null counts.
frame_shape = df.shape
print("Shape:", frame_shape)
df.info()

# Final expression of the cell, so the first five rows render as a table.
df.head(5)

## Step 2: Data Cleaning

In [None]:
# Clean the data in a single non-mutating chain and rebind `df`.
#
# Why not `df['col'].fillna(..., inplace=True)`: that call acts on a temporary
# Series selected from the frame. Recent pandas versions warn about it, and
# with Copy-on-Write enabled the parent frame is never updated, so the missing
# values would silently remain. Assigning the filled column back is reliable
# on every pandas version and keeps the cell safe to re-run.
#
# Steps, in order:
#   1. drop exact duplicate rows;
#   2. fill missing Sales / Discount with the column mean and missing Profit
#      with the column median (statistics are computed after de-duplication);
#   3. parse 'Date' into datetime64 values.
df = (
    df
    .drop_duplicates()
    .assign(
        Sales=lambda d: d['Sales'].fillna(d['Sales'].mean()),
        Profit=lambda d: d['Profit'].fillna(d['Profit'].median()),
        Discount=lambda d: d['Discount'].fillna(d['Discount'].mean()),
        Date=lambda d: pd.to_datetime(d['Date']),
    )
)

## Step 3: Exploratory Data Analysis

### Sales Trends Over Time

In [None]:
# Total sales per calendar date, with 'Date' kept as a regular column.
sales_trend = df.groupby('Date', as_index=False)['Sales'].sum()

# Line chart of the daily totals; a marker is drawn at every date.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(sales_trend['Date'], sales_trend['Sales'], marker='o')
ax.set_title("Sales Trends Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Sales")

# Slant the date labels so they do not overlap, then re-fit the layout so
# the rotated labels stay inside the figure.
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

### Relationship: Profit vs. Discount

In [None]:
# Scatter of Profit against Discount, one point per row of the frame.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='Discount', y='Profit', ax=ax)
ax.set_title("Profit vs Discount")
ax.set_xlabel("Discount")
ax.set_ylabel("Profit")
plt.show()

### Sales by Region and Category

In [None]:
def _plot_total_sales(frame, group_col, title, xtick_rotation=None):
    """Bar-plot the summed 'Sales' of `frame` per value of `group_col`.

    Parameters
    ----------
    frame : DataFrame holding `group_col` and a 'Sales' column.
    group_col : name of the column to group by (x axis of the chart).
    title : figure title.
    xtick_rotation : rotation in degrees for the x tick labels; None leaves
        the labels untouched.

    Returns
    -------
    DataFrame with one row per group and the columns [group_col, 'Sales'].
    """
    totals = frame.groupby(group_col)['Sales'].sum().reset_index()
    plt.figure(figsize=(8, 5))
    sns.barplot(x=group_col, y='Sales', data=totals)
    plt.title(title)
    if xtick_rotation is not None:
        plt.xticks(rotation=xtick_rotation)
    plt.show()
    return totals


# Sales by Region (tick labels are slanted by 45 degrees)
region_sales = _plot_total_sales(df, 'Region', "Total Sales by Region", xtick_rotation=45)

# Sales by Category (default, unrotated tick labels)
category_sales = _plot_total_sales(df, 'Category', "Total Sales by Category")

## Step 4: Predictive Modeling - Linear Regression

In [None]:
# Predictors and target for the regression.
feature_cols = ['Profit', 'Discount']
X = df[feature_cols]
y = df['Sales']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Ordinary least-squares fit on the training portion (fit returns the
# estimator itself, so construction and fitting can be chained).
model = LinearRegression().fit(X_train, y_train)

# Predicted Sales for the held-out rows.
y_pred = model.predict(X_test)

### Model Evaluation

In [None]:
# Score the held-out predictions: R² and mean squared error (MSE is in
# squared units of Sales).
r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

# One metric per line, rounded to two decimals.
print(f"R² Score: {r2:.2f}", f"Mean Squared Error: {mse:.2f}", sep="\n")

## Step 5: Export Cleaned Dataset

In [None]:
# Write the frame to CSV without the index column and confirm the file name.
output_path = 'Cleaned_Sales_Data.csv'
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved as '{output_path}'")