# CODSOFT INTERNSHIP
# Task 4: Sales Prediction Using Python
# Author: Aab El Roi
# Field: Data Science
# Batch: September 2024

# AIM

# To develop a robust sales prediction model using machine learning techniques in Python. This model will forecast future sales volumes by analyzing historical sales data and considering various influencing factors such as advertising expenditure, target audience segmentation, and advertising platform selection. The goal is to enable businesses to make informed decisions on optimizing their advertising strategies and maximizing sales potential through accurate and actionable predictions.

# 1: Load and Explore Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the advertising dataset from the mounted Drive folder.
csv_path = '/content/drive/MyDrive/CODSOFT Internship/TASK 4/advertising.csv'
data = pd.read_csv(csv_path)

# Quick look at the data: the leading rows (to confirm the column names),
# followed by summary statistics.
print(data.head())
print(data.describe())

# Two histograms side by side. The 'TV' column is what this notebook plots
# as "advertising expenditure"; 'Sales' is the prediction target.
# Each entry is (column, bar colour, panel title, x-axis label).
histogram_panels = [
    ('TV', 'skyblue', 'Distribution of Advertising Expenditure', 'Advertising Expenditure'),
    ('Sales', 'salmon', 'Distribution of Sales', 'Sales'),
]

plt.figure(figsize=(12, 5))
for slot, (column, bar_color, heading, x_label) in enumerate(histogram_panels, start=1):
    # Panels are numbered from 1 in a 1-row, 2-column grid.
    plt.subplot(1, 2, slot)
    plt.hist(data[column], bins=20, color=bar_color, edgecolor='black')
    plt.title(heading)
    plt.xlabel(x_label)
    plt.ylabel('Frequency')

# Prevent the two panels' labels from overlapping before rendering.
plt.tight_layout()
plt.show()

      TV  Radio  Newspaper  Sales
0  230.1   37.8       69.2   22.1
1   44.5   39.3       45.1   10.4
2   17.2   45.9       69.3   12.0
3  151.5   41.3       58.5   16.5
4  180.8   10.8       58.4   17.9
               TV       Radio   Newspaper       Sales
count  200.000000  200.000000  200.000000  200.000000
mean   147.042500   23.264000   30.554000   15.130500
std     85.854236   14.846809   21.778621    5.283892
min      0.700000    0.000000    0.300000    1.600000
25%     74.375000    9.975000   12.750000   11.000000
50%    149.750000   22.900000   25.750000   16.000000
75%    218.825000   36.525000   45.100000   19.050000
max    296.400000   49.600000  114.000000   27.000000


# 2: Prepare Data and Train Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Single-feature setup: 'TV' is the only predictor, 'Sales' is the target.
# The double brackets keep X two-dimensional (a DataFrame), which is the
# shape the estimator's fit/predict calls are given here.
feature_columns = ['TV']
target_column = 'Sales'
X = data[feature_columns]
y = data[target_column]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Fit ordinary least squares on the training rows, then predict the
# held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# 3: Visualize the Regression Line

In [None]:
# Show the held-out observations as points and the model's predictions for
# the same inputs as a line on a single set of axes.
fig, ax = plt.subplots(figsize=(8, 6))

ax.scatter(X_test, y_test, color='black', label='Actual data')
ax.plot(X_test, y_pred, color='blue', linewidth=3, label='Regression line')

ax.set_title('Advertising vs Sales with Regression Line')
ax.set_xlabel('Advertising Expenditure')
ax.set_ylabel('Sales')

# The legend picks up the `label=` values given to scatter/plot above.
ax.legend()
plt.show()

# 4: Save the Model

In [None]:
import joblib

# Output locations for the fitted model and the dataset.
model_path = '/content/sales_model.pkl'
data_path = '/content/sales_data.csv'

# Persist the fitted estimator so it can be reloaded without retraining.
joblib.dump(model, model_path)

# Write the dataset out for visualization in Streamlit; index=False keeps
# the row index out of the CSV.
data.to_csv(data_path, index=False)