In [1]:
# Show which Python interpreter backs this kernel, so it is clear which
# environment any packages installed below must end up in.
import sys
print(sys.executable)

C:\ProgramData\anaconda3\python.exe


In [2]:
!pip install yfinance

Defaulting to user installation because normal site-packages is not writeable


In [11]:
import pandas as pd
import random
import numpy as np

# Parameters for synthetic data
num_users = 1000
num_stocks = 500

# Fixed seed so that restarting the kernel and re-running the notebook
# regenerates the same CSV files instead of a fresh random dataset each time.
SEED = 42
np.random.seed(SEED)

# User Data: one row per user, keyed by UserID ("U0001", "U0002", ...).
user_data = {
    "UserID": [f"U{str(i).zfill(4)}" for i in range(1, num_users + 1)],
    "Age": np.random.randint(18, 65, num_users),  # upper bound is exclusive: 18..64
    "RiskTolerance": np.random.choice(["Low", "Medium", "High"], num_users),
    "InvestmentBudget": np.random.randint(5000, 100000, num_users),
    "InvestmentGoals": np.random.choice(["Short-term", "Long-term", "Balanced"], num_users),
}

# Stock Data: one row per stock, keyed by StockID ("S0001", "S0002", ...).
stock_data = {
    "StockID": [f"S{str(i).zfill(4)}" for i in range(1, num_stocks + 1)],
    "StockName": [f"Stock_{i}" for i in range(1, num_stocks + 1)],
    "Sector": np.random.choice(["IT", "Pharma", "Finance", "Energy", "Retail"], num_stocks),
    "CurrentPrice": np.round(np.random.uniform(50, 5000, num_stocks), 2),
    "Volatility": np.round(np.random.uniform(0.5, 5, num_stocks), 2),
    "PERatio": np.round(np.random.uniform(5, 50, num_stocks), 2),
    "DividendYield": np.round(np.random.uniform(0, 8, num_stocks), 2),
}

# Market Data: one row per user (num_users rows) with no key column, so it can
# only be aligned with the user table by row position.
market_data = {
    "MarketSentiment": np.random.choice(["Positive", "Neutral", "Negative"], num_users),
    "EconomicIndicator": np.round(np.random.uniform(-2, 8, num_users), 2),
    "IndexPerformance": np.round(np.random.uniform(-5, 10, num_users), 2),
}

# Historical Data and Recommendations
# Historical data has one row per stock, in the same order as stock_data.
# StockID is stored explicitly so consumers can join on the key instead of
# rebuilding it from row position. It is deliberately the LAST column so the
# column order matches a frame where StockID is appended after loading.
historical_data = {
    "Historical1YrReturn": np.round(np.random.uniform(-20, 30, num_stocks), 2),
    "Historical5YrReturn": np.round(np.random.uniform(-50, 100, num_stocks), 2),
    "BuySellSignal": np.random.choice(["Buy", "Sell", "Hold"], num_stocks),
    "TradingVolume": np.random.randint(1000, 1000000, num_stocks),
    "StockID": stock_data["StockID"],
}

# One randomly chosen stock and rationale per user (row i belongs to user i).
recommendations = {
    "RecommendedStockID": np.random.choice(stock_data["StockID"], num_users),
    "RecommendationRationale": np.random.choice(
        ["High Growth Potential", "Stable Returns", "Undervalued Stock"],
        num_users,
    ),
}

# Create DataFrames
users_df = pd.DataFrame(user_data)
stocks_df = pd.DataFrame(stock_data)
market_df = pd.DataFrame(market_data)
historical_df = pd.DataFrame(historical_data)
recommendations_df = pd.DataFrame(recommendations)

# Combine data for final dataset: the full user table plus the two
# recommendation columns, aligned on the shared default row index.
dataset = users_df.copy()
dataset["RecommendedStockID"] = recommendations_df["RecommendedStockID"]
dataset["RecommendationRationale"] = recommendations_df["RecommendationRationale"]

# Save datasets
users_df.to_csv("Users.csv", index=False)
stocks_df.to_csv("Stocks.csv", index=False)
market_df.to_csv("MarketData.csv", index=False)
historical_df.to_csv("HistoricalData.csv", index=False)
dataset.to_csv("Recommendations.csv", index=False)

print("Synthetic datasets generated and saved!")


Synthetic datasets generated and saved!


In [21]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Load the datasets
users_df = pd.read_csv("Users.csv")
stocks_df = pd.read_csv("Stocks.csv")
market_df = pd.read_csv("MarketData.csv")
historical_df = pd.read_csv("HistoricalData.csv")
recommendations_df = pd.read_csv("Recommendations.csv")

# ---- 1. Data Integration ----
# Merge user data with their recommendations. validate= makes pandas raise a
# MergeError if UserID is duplicated on either side, instead of silently
# multiplying rows.
# NOTE(review): Recommendations.csv also carries the user attribute columns, so
# this merge yields duplicated pairs (Age_x/Age_y, RiskTolerance_x/_y, ...).
# The column names are left unchanged here so that anything reading
# Prepared_Data.csv keeps working; merging only UserID plus the two
# recommendation columns would remove the duplicates.
user_recommendations = users_df.merge(
    recommendations_df, on="UserID", how="left", validate="one_to_one"
)

# HistoricalData.csv may not carry a stock key. Only when it is absent, rebuild
# one from row position (this assumes the rows are in the same order as
# Stocks.csv). An existing StockID column is never overwritten.
if "StockID" not in historical_df.columns:
    historical_df["StockID"] = [
        f"S{str(i).zfill(4)}" for i in range(1, len(historical_df) + 1)
    ]

# Attach the attributes of each user's recommended stock. Many users may share
# a stock, but each StockID must appear only once in stocks_df.
full_data = user_recommendations.merge(
    stocks_df,
    left_on="RecommendedStockID",
    right_on="StockID",
    how="left",
    validate="many_to_one",
)

# Market data has no key column, so it is aligned by row position (index).
# An explicit left join keeps every user row; if market_df is shorter, the gap
# shows up in the missing-value report below instead of rows being dropped.
full_data = full_data.merge(market_df, left_index=True, right_index=True, how="left")

# Merge historical data for recommended stocks. Both sides now have a StockID
# column, so pandas suffixes them as StockID_x (stocks) and StockID_y (history).
full_data = full_data.merge(
    historical_df,
    left_on="RecommendedStockID",
    right_on="StockID",
    how="left",
    validate="many_to_one",
)

# ---- 2. Data Cleaning ----
# Check for missing values and handle them
missing_values = full_data.isnull().sum()
print("Missing values:\n", missing_values)

# Fill missing values with appropriate defaults. Assignment is used instead of
# inplace=True so the cell does not mutate a frame behind the reader's back.
full_data = full_data.fillna({
    "DividendYield": 0,  # Assume no dividend if missing
    "Historical1YrReturn": 0,  # Assume no historical return
    "Historical5YrReturn": 0,
})

# ---- 3. Feature Engineering ----
# Risk-Adjusted Return: 1-year return divided by (1 + volatility).
# NOTE(review): the +1 presumably guards against a zero/near-zero denominator;
# it is not a plain return/volatility ratio - confirm this is intended.
full_data["RiskAdjustedReturn"] = (
    full_data["Historical1YrReturn"] / (1 + full_data["Volatility"])
)

# Average 1-year return per sector, attached to every row as SectorAvgReturn.
# NOTE(review): the mean is taken over user rows, so a stock recommended to
# several users counts several times and never-recommended stocks are
# excluded - confirm this weighting is intended.
sector_performance = (
    full_data.groupby("Sector")["Historical1YrReturn"]
    .mean()
    .reset_index()
    .rename(columns={"Historical1YrReturn": "SectorAvgReturn"})
)
full_data = full_data.merge(sector_performance, on="Sector", how="left")

# Market Condition Index (average of economic indicator and index performance).
# Computed before scaling, so it uses the raw values of both inputs.
full_data["MarketConditionIndex"] = (
    full_data["EconomicIndicator"] + full_data["IndexPerformance"]
) / 2

# ---- 4. Data Scaling ----
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Numerical columns to scale (the engineered features above are left unscaled)
numerical_columns = ["CurrentPrice", "Volatility", "PERatio", "DividendYield",
                     "Historical1YrReturn", "Historical5YrReturn", "IndexPerformance"]

# Apply scaling to the numerical columns
full_data[numerical_columns] = scaler.fit_transform(full_data[numerical_columns])

# Save the prepared dataset for modeling
full_data.to_csv("Prepared_Data.csv", index=False)

# Print the processed dataset columns and missing values
print("Data preparation complete. Dataset saved as 'Prepared_Data.csv'")
print("Missing values after filling:\n", full_data.isnull().sum())
print("Columns in the final dataset:\n", full_data.columns)


Missing values:
 UserID                     0
Age_x                      0
RiskTolerance_x            0
InvestmentBudget_x         0
InvestmentGoals_x          0
Age_y                      0
RiskTolerance_y            0
InvestmentBudget_y         0
InvestmentGoals_y          0
RecommendedStockID         0
RecommendationRationale    0
StockID_x                  0
StockName                  0
Sector                     0
CurrentPrice               0
Volatility                 0
PERatio                    0
DividendYield              0
MarketSentiment            0
EconomicIndicator          0
IndexPerformance           0
Historical1YrReturn        0
Historical5YrReturn        0
BuySellSignal              0
TradingVolume              0
StockID_y                  0
dtype: int64
Data preparation complete. Dataset saved as 'Prepared_Data.csv'
Missing values after filling:
 UserID                     0
Age_x                      0
RiskTolerance_x            0
InvestmentBudget_x         0
Inves