<a href="https://colab.research.google.com/github/ANDRREL/python-projects/blob/main/E_S_ANDRREL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SALES FORECASTING REPORT FOR RETAIL STORES

# Mount Google Drive (the google.colab helper is only available inside Colab)
# so files under /content/drive can be read, then import the libraries used below.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import holidays
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Load the raw data from Google Drive; the file must exist under the mounted Drive.
# NOTE(review): the recorded run of this cell printed course-catalogue columns
# (partner, course, skills, ...) and no date/sales columns -- confirm that this
# path really points at the retail sales export.
df = pd.read_csv('/content/drive/MyDrive/Coursera.csv')
df.columns = df.columns.str.lower()

# Print the column names so the schema can be checked at a glance.
print(df.columns)

# Every later step needs these columns. Fail here with an explicit message
# instead of an opaque KeyError raised by the first column lookup.
required_cols = ['date', 'storeid', 'productid', 'sales']
missing_cols = [c for c in required_cols if c not in df.columns]
if missing_cols:
    raise KeyError(
        f"input file is missing required column(s) {missing_cols}; "
        f"available columns: {list(df.columns)}"
    )

# Parse dates and order each store/product series chronologically, which the
# shift-based lag features computed later rely on.
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['storeid', 'productid', 'date']).reset_index(drop=True)

# data checks
print(df.isna().sum())
print(df[['storeid', 'productid']].nunique())
print(df['date'].min(), df['date'].max())
# Missing sales values are treated as zero sales.
df['sales'] = df['sales'].fillna(0)


# Public holidays for India. The holidays object fills in a year's entries
# lazily, the first time a date from that year is looked up.
country_holidays = holidays.country_holidays('IN')

# One row per calendar day across the observed date range.
dcal = pd.DataFrame({'Date': pd.date_range(df['date'].min(), df['date'].max())})
dcal['dow']   = dcal['Date'].dt.dayofweek
dcal['dom']   = dcal['Date'].dt.day
dcal['week']  = dcal['Date'].dt.isocalendar().week.astype(int)
dcal['month'] = dcal['Date'].dt.month
dcal['year']  = dcal['Date'].dt.year
dcal['is_weekend'] = (dcal['dow'] >= 5).astype(int)
# Test each date with `in` rather than Series.isin(country_holidays): isin only
# iterates over the dates already loaded in the object -- none at this point --
# so it would flag every single day as a non-holiday.
dcal['is_holiday'] = dcal['Date'].map(lambda d: d in country_holidays).astype(int)

# Attach the calendar features to every sales row; the helper key is dropped.
df = df.merge(dcal, left_on='date', right_on='Date', how='left').drop(columns='Date')

def add_lags(group, lags=(1,7,28), rolls=(7,28)):
    """Return a copy of ``group`` with lagged and trailing-window sales features.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of one series with a ``sales`` column. Rows are used in the order
        given, so the caller should pass them sorted by time.
    lags : iterable of int
        For each ``l`` a column ``lag_<l>`` holding ``sales`` shifted by ``l`` rows.
    rolls : iterable of int
        For each ``r`` the columns ``rollmean_<r>`` and ``rollstd_<r>``: mean and
        standard deviation over the ``r`` rows that precede the current row.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left untouched.
    """
    # Work on a copy so the caller's frame (e.g. a groupby slice) is not mutated.
    out = group.copy()
    sales = out['sales']

    for l in lags:
        out[f'lag_{l}'] = sales.shift(l)

    # Shift by one row first so a window never contains the row's own value.
    previous = sales.shift(1)
    for r in rolls:
        window = previous.rolling(r)
        out[f'rollmean_{r}'] = window.mean()
        out[f'rollstd_{r}'] = window.std()
    return out

# Compute the lag/rolling features separately for every store/product series,
# then flatten the index produced by the group-wise apply.
series_keys = ['storeid', 'productid']
lagged = df.groupby(series_keys, as_index=False).apply(add_lags)
df = lagged.reset_index(drop=True)

# Rows at the start of a series have incomplete lag/rolling history; drop them.
engineered_cols = [col for col in df.columns if col.startswith(('lag_', 'roll'))]
df = df.dropna(subset=engineered_cols)
df.head()

# Time-based hold-out: the most recent 90 days are used for validation.
cutoff = df['date'].max() - pd.Timedelta(days=90)

features = [
    'dow','dom','week','month','year','is_weekend','is_holiday',
    'lag_1','lag_7','lag_28','rollmean_7','rollmean_28','rollstd_7','rollstd_28'
]

# Encode storeid/productid as integer codes for XGBoost
from sklearn.preprocessing import LabelEncoder
le_storeid, le_productid = LabelEncoder(), LabelEncoder()
df['storeid_le'] = le_storeid.fit_transform(df['storeid'])
df['productid_le']  = le_productid.fit_transform(df['productid'])

features += ['storeid_le','productid_le']

# Split only once, after the encoded columns exist, so that train/valid carry
# every column listed in `features`.
train = df[df['date'] <= cutoff].copy()
valid = df[df['date'] >  cutoff].copy()

X_train, y_train = train[features], train['sales']
X_valid, y_valid = valid[features], valid['sales']


# Baseline: fit Prophet on a single series -- the store/product pair found in
# the first row of df.
s = df['storeid'].iloc[0]
i = df['productid'].iloc[0]
pair_mask = (df['storeid'] == s) & (df['productid'] == i)
# Prophet expects the columns to be called 'ds' (timestamp) and 'y' (target).
ts = df.loc[pair_mask, ['date', 'sales']].rename(columns={'date': 'ds', 'sales': 'y'})

from prophet import Prophet
m = Prophet()
m.fit(ts)
future = m.make_future_dataframe(periods=30)  # 30-day horizon
full_forecast = m.predict(future)
fcst = full_forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
fcst.tail()

# Gradient-boosted regressor trained on the engineered features.
model = XGBRegressor(
    n_estimators=800,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.9,
    random_state=42,
    tree_method="hist"
)
model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

# Score the hold-out period.
pred_valid = model.predict(X_valid)
mae  = mean_absolute_error(y_valid, pred_valid)
# mean_squared_error returns the MSE; take the square root so that `rmse`
# really is a root-mean-squared error, in the same units as sales.
rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
# A bare `mae, rmse` expression is only displayed when it is the last line of
# a notebook cell, so print the metrics explicitly.
print(f"MAE: {mae:.4f}  RMSE: {rmse:.4f}")

# future forecast of stock
H = 30  # forecast horizon in days
future_Dates = pd.date_range(df['date'].max() + pd.Timedelta(days=1), periods=H)

# Cross join: one row per (store, product) pair and future date.
pairs = df[['storeid','productid','storeid_le','productid_le']].drop_duplicates()
future = pairs.assign(key=1).merge(pd.DataFrame({'Date':future_Dates,'key':1}), on='key').drop('key', axis=1)

# dcal only spans the historical date range, so merging it onto the future
# dates would leave every calendar feature NaN. Build the same calendar
# columns for the forecast horizon instead.
future_cal = pd.DataFrame({'Date': future_Dates})
future_cal['dow']   = future_cal['Date'].dt.dayofweek
future_cal['dom']   = future_cal['Date'].dt.day
future_cal['week']  = future_cal['Date'].dt.isocalendar().week.astype(int)
future_cal['month'] = future_cal['Date'].dt.month
future_cal['year']  = future_cal['Date'].dt.year
future_cal['is_weekend'] = (future_cal['dow'] >= 5).astype(int)
# Per-date membership test: the holidays object loads a year's entries on lookup.
future_cal['is_holiday'] = future_cal['Date'].map(lambda d: d in country_holidays).astype(int)

future = future.merge(future_cal, on='Date', how='left')

# Sales history used to derive lag/rolling features for the future rows.
hist_window = df[['storeid','productid','date','sales']].copy()

def build_future_features(g):
    """Build lag and rolling-window features for the forecast rows of one series.

    Parameters
    ----------
    g : pandas.DataFrame
        Sales history of a single store/product pair with the columns
        ``storeid``, ``productid``, ``date`` and ``sales``.

    Returns
    -------
    pandas.DataFrame
        The rows of the module-level ``future`` frame that belong to this pair,
        extended with ``lag_*``, ``rollmean_*`` and ``rollstd_*`` columns and a
        ``date`` column that mirrors ``Date``.

    Notes
    -----
    Future sales are unknown (NaN), so a lag that reaches into the forecast
    horizon itself is NaN; only lags that reach back into history get a value.
    """
    g = g.sort_values('date')
    store, product = g['storeid'].iloc[0], g['productid'].iloc[0]

    f = future[(future['storeid'] == store) & (future['productid'] == product)].copy()
    f['sales'] = np.nan

    # History is keyed by 'date' and the horizon by 'Date'. Put both on a single
    # 'Date' axis before concatenating; otherwise the history rows have a NaN
    # 'Date', sort after the horizon, and the shifts below read the wrong rows.
    history = g[['date', 'sales']].rename(columns={'date': 'Date'})
    both = pd.concat([history, f[['Date', 'sales']]], ignore_index=True)
    both = both.sort_values('Date')

    for l in (1,7,28):
        both[f'lag_{l}'] = both['sales'].shift(l)
    # Shift by one row first so a window never contains the row's own value.
    previous = both['sales'].shift(1)
    for r in (7,28):
        both[f'rollmean_{r}'] = previous.rolling(r).mean()
        both[f'rollstd_{r}']  = previous.rolling(r).std()

    # Keep only the forecast rows and attach their features to the future frame.
    both = both[both['Date'].isin(f['Date'])]
    out = f.merge(both.drop(columns='sales'), on='Date', how='left')
    # The code that consumes this frame addresses the forecast day as 'date'.
    out['date'] = out['Date']
    return out

# Lag/rolling features for every future (store, product, day) row.
future_feats = (
    hist_window.groupby(['storeid','productid'], group_keys=False)
    .apply(build_future_features)
    .reset_index(drop=True)
)

# The rows returned by build_future_features come from `future`, which already
# carries storeid_le/productid_le and the calendar columns. Merging `pairs` in
# again would rename the encoded ids to *_x / *_y and make the feature
# selection below fail, so no further merge is done here.
X_future = future_feats[features].copy()

# NOTE(review): lags that reach into the forecast horizon are NaN (future sales
# are unknown) and become 0 here; a recursive day-by-day forecast would be
# needed to feed earlier predictions back in as lag values.
X_future = X_future.fillna(0)

future_preds = model.predict(X_future)
# 'Date' is the column inherited from `future` and always holds the forecast day.
future_out = future_feats[['Date','storeid','productid']].rename(columns={'Date': 'date'})
# Sales cannot be negative; clip the predictions at zero.
future_out['forecast'] = np.maximum(0, future_preds)

future_out.head()

# exporting to powerBI
# Each table is written to its own file. Writing all three to the input path
# (/content/drive/MyDrive/Coursera.csv) would overwrite the source data and
# leave only the last table on disk.
# NOTE(review): these are course-catalogue columns, not the date/storeid/
# productid/sales columns the pipeline above works with -- confirm the
# intended export schema.
export_cols = ['partner','course','skills','rating','reviewcount','level']

valid_export = valid[export_cols].copy()
valid_export['forecast'] = pred_valid
valid_export.to_csv('/content/drive/MyDrive/valid_forecast.csv', index=False)
future_out.to_csv('/content/drive/MyDrive/future_forecast.csv', index=False)
df[export_cols].to_csv('/content/drive/MyDrive/history.csv', index=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Index(['partner', 'course', 'skills', 'rating', 'reviewcount', 'level',
       'certificatetype', 'duration', 'crediteligibility'],
      dtype='object')


KeyError: 'date'

In [None]:
!pip install pandas scikit-learn surprise
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
df = pd.read_csv("Coursera.csv")   # Load your dataset

# NOTE(review): the column listing recorded for Coursera.csv in the previous
# cell shows 'course' and 'skills' but no 'title' or 'tags' -- confirm the
# column names used here against the actual file.
# Combine useful features. Fill missing values on BOTH sides: a missing title
# would otherwise turn the whole concatenation into NaN, which TfidfVectorizer
# rejects as an invalid document.
df["features"] = (
    df["title"].fillna("").astype(str) + " " + df["tags"].fillna("").astype(str)
)

# Convert text into vectors
vectorizer = TfidfVectorizer(stop_words="english")
feature_matrix = vectorizer.fit_transform(df["features"])

# Compute the pairwise similarity between all rows (dense n x n matrix)
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

# Function to recommend similar courses
def recommend_content(course_title, top_n=5):
    """Return the titles of the ``top_n`` courses most similar to ``course_title``.

    Similarity is read from the module-level ``cosine_sim`` matrix, whose rows
    and columns follow the row order of the module-level ``df``.

    Parameters
    ----------
    course_title : str
        Exact value of the ``title`` column identifying the query course. If
        several rows share the title, the first one is used.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Titles ordered from most to least similar, excluding the query row.

    Raises
    ------
    IndexError
        If no row has the given title.
    """
    # Locate the query row by POSITION: cosine_sim is positional, so an index
    # label is only safe to use while df has a default RangeIndex.
    positions = (df["title"] == course_title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"course title not found: {course_title!r}")
    query_pos = int(positions[0])

    # Exclude the query row explicitly instead of slicing off the first ranked
    # item: with tied similarities the query is not guaranteed to rank first.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(cosine_sim[query_pos]) if pos != query_pos),
        key=lambda item: item[1],
        reverse=True,
    )
    return [df.iloc[pos]["title"] for pos, _ in ranked[:top_n]]

print("📚 Content-based recommendations for 'Python for Beginners':")
print(recommend_content("Python for Beginners"))

# Prepare data for Surprise library: load_from_df expects exactly three columns
# in the order user, item, rating.
reader = Reader(rating_scale=(1, 5))
# Train on the `rating` column -- presumably on the 1-5 scale declared above;
# confirm against the data. `reviewcount` is a count of reviews, not a rating,
# and does not fit the declared rating scale.
# NOTE(review): `partner` is used as the "user" id here -- confirm that this is
# the intended user dimension.
ratings = df[["partner", "course", "rating"]].dropna()
data = Dataset.load_from_df(ratings, reader)

trainset, testset = train_test_split(data, test_size=0.2)

# Train SVD model
model = SVD()
model.fit(trainset)

# Predict ratings for the held-out interactions
predictions = model.test(testset)

# Function to recommend courses for a user
def recommend_collaborative(partner, top_n=5):
    """Return titles of the ``top_n`` courses with the highest predicted rating.

    Uses the module-level ``df`` and the fitted module-level ``model``.

    Parameters
    ----------
    partner : hashable
        User id as stored in the ``partner`` column.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Titles ordered from highest to lowest predicted rating, one entry per
        course; courses already associated with ``partner`` are excluded.
    """
    # A set gives O(1) membership tests; scanning an array for every candidate
    # is quadratic in the number of courses.
    already_seen = set(df.loc[df["partner"] == partner, "course"])
    candidates = [c for c in df["course"].dropna().unique() if c not in already_seen]

    scored = [(cid, model.predict(partner, cid).est) for cid in candidates]
    scored.sort(key=lambda item: item[1], reverse=True)

    # Map course id -> title (first occurrence) and emit titles in RANK order.
    # Filtering df with isin() would return them in frame order and could
    # yield more than top_n rows when a course appears in several rows.
    # NOTE(review): 'title' is read here while 'course' is the item id --
    # confirm that both columns exist in the dataset.
    title_by_course = df.drop_duplicates("course").set_index("course")["title"]
    return [title_by_course[cid] for cid, _ in scored[:top_n]]

# Demo: collaborative-filtering recommendations for the sample user id "U1".
print("🤝 Collaborative recommendations for user U1:")
collab_demo = recommend_collaborative("U1")
print(collab_demo)
def hybrid_recommendation(partner, course_title, top_n=5):
    """Blend content-based and collaborative recommendations.

    Parameters
    ----------
    partner : hashable
        User id passed to ``recommend_collaborative``.
    course_title : str
        Course title passed to ``recommend_content``.
    top_n : int
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` distinct titles, taken alternately from the content
        ranking and the collaborative ranking, best-ranked first.
    """
    from itertools import chain, zip_longest

    # Ask each recommender for enough candidates to fill the result on its own.
    pool_size = max(10, top_n)
    content_recs = recommend_content(course_title, top_n=pool_size)
    collab_recs = recommend_collaborative(partner, top_n=pool_size)

    # Interleave the two rankings and de-duplicate while keeping order.
    # Truncating a set union would select arbitrary items and discard the ranking.
    absent = object()
    interleaved = (
        title
        for title in chain.from_iterable(
            zip_longest(content_recs, collab_recs, fillvalue=absent)
        )
        if title is not absent
    )
    final_recs = list(dict.fromkeys(interleaved))[:top_n]
    return final_recs

# Demo: hybrid recommendations for sample user "U1" and a sample seed course.
print("🌟 Hybrid recommendations for U1 who liked 'Python for Beginners':")
hybrid_demo = hybrid_recommendation("U1", "Python for Beginners")
print(hybrid_demo)
