In [3]:
import pandas as pd
import random

def generate_synthetic_data(num_rows=120, seed=None):
    """Generate a synthetic student-commute dataset.

    Each row gets a random age (18-30), gender, major and home-to-campus
    distance (0.5-15.0 km, one decimal). The transport mode is picked from
    Walk/Rickshaw for distances under 2 km and from Bus/Car otherwise.
    Walkers always get "Low" traffic; every other mode gets a random traffic
    level. Commute time is distance multiplied by a minutes-per-km pace that
    depends on the mode and the traffic level, rounded to one decimal.

    Parameters
    ----------
    num_rows : int, default 120
        Number of rows to generate.
    seed : int or None, default None
        When given, rows are drawn from a private ``random.Random(seed)`` so
        the result is reproducible and the module-level generator is left
        untouched. When None, the module-level ``random`` functions are used,
        exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns: "Age", "Gender", "Major", "Distance (km)",
        "Mode of Transportation", "Traffic", "Commute Time (min)".
        The columns are present even when ``num_rows`` is 0.
    """
    # Fall back to the global generator when no seed is supplied so callers
    # that seeded `random` themselves keep getting the same stream.
    rng = random if seed is None else random.Random(seed)

    # Pace in minutes per km, keyed by transport mode and then traffic level.
    # Walking is only ever paired with "Low" traffic (approx. 10 min per km).
    minutes_per_km = {
        "Walk": {"Low": 10},
        "Rickshaw": {"Low": 12, "Medium": 15, "High": 18},
        "Bus": {"Low": 8, "Medium": 12, "High": 15},
        "Car": {"Low": 7, "Medium": 10, "High": 14},
    }

    columns = [
        "Age",
        "Gender",
        "Major",
        "Distance (km)",
        "Mode of Transportation",
        "Traffic",
        "Commute Time (min)",
    ]

    data = []
    for _ in range(num_rows):
        # Demographics
        age = rng.randint(18, 30)
        gender = rng.choice(["Male", "Female"])
        major = rng.choice(["CSE", "EEE", "Business", "Engineering"])

        # Distance in km, one decimal place
        distance = round(rng.uniform(0.5, 15.0), 1)

        # Short trips are made on foot or by rickshaw, longer ones by bus or car
        if distance < 2.0:
            mode_of_transport = rng.choice(["Walk", "Rickshaw"])
        else:
            mode_of_transport = rng.choice(["Bus", "Car"])

        # Walkers are not affected by traffic; everyone else gets a random level
        if mode_of_transport == "Walk":
            traffic = "Low"
        else:
            traffic = rng.choice(["Low", "Medium", "High"])

        pace = minutes_per_km[mode_of_transport][traffic]
        commute_time = round(distance * pace, 1)

        data.append({
            "Age": age,
            "Gender": gender,
            "Major": major,
            "Distance (km)": distance,
            "Mode of Transportation": mode_of_transport,
            "Traffic": traffic,
            "Commute Time (min)": commute_time,
        })

    # Passing the column list keeps the schema intact for an empty result.
    return pd.DataFrame(data, columns=columns)

# Build a 110-row commute dataset (overrides the generator's default row count)
synthetic_data = generate_synthetic_data(num_rows=110)

# Preview the leading five rows as plain text
print(synthetic_data.head(5))

# Write the frame to CSV, leaving the row index out of the file
synthetic_data.to_csv("commute_data_saib.csv", index=False)


   Age  Gender        Major  Distance (km) Mode of Transportation Traffic  \
0   25    Male          CSE            3.7                    Bus  Medium   
1   27    Male          CSE           10.3                    Bus    High   
2   30  Female          EEE           10.8                    Bus     Low   
3   28  Female          EEE           14.0                    Car  Medium   
4   20    Male  Engineering           11.7                    Car  Medium   

   Commute Time (min)  
0                44.4  
1               154.5  
2                86.4  
3               140.0  
4               117.0  


In [4]:
import pandas as pd
import random

# Function to generate realistic university student data with outliers
def generate_student_data_with_outliers(n=130, seed=None):
    """Generate synthetic student social-media data with injected outliers.

    Each row holds random demographics (age 18-26, gender, major), daily
    hours on social media, a friends/followers count, a primary platform, a
    posting frequency and an academic-performance label. Hours and follower
    counts each have a 5% chance of being replaced by an extreme value
    (0, or 10-15 hours / 5000-10000 followers). Academic performance and
    posting frequency are drawn with weights that depend on the hours value.

    Parameters
    ----------
    n : int, default 130
        Number of rows to generate.
    seed : int or None, default None
        When given, rows are drawn from a private ``random.Random(seed)`` so
        the result is reproducible and the module-level generator is left
        untouched. When None, the module-level ``random`` functions are used,
        exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns: "Age", "Gender", "Major", "Hours on Social Media",
        "Friends/Followers", "Social Media Platform", "Posting Frequency",
        "Academic Performance". The columns are present even when ``n`` is 0.
    """
    # Fall back to the global generator when no seed is supplied so callers
    # that seeded `random` themselves keep getting the same stream.
    rng = random if seed is None else random.Random(seed)

    columns = [
        "Age",
        "Gender",
        "Major",
        "Hours on Social Media",
        "Friends/Followers",
        "Social Media Platform",
        "Posting Frequency",
        "Academic Performance",
    ]

    data = []

    for _ in range(n):
        # Basic demographic details
        age = rng.randint(18, 26)  # University age range
        gender = rng.choice(["Male", "Female"])
        major = rng.choice(["Computer Science", "EEE", "CSE", "IT", "Business", "Mechanical", "Biology", "Psychology"])

        # Social media usage hours: 5% outliers (0 or 10-15 h), else 0-8 h
        if rng.random() < 0.05:
            hours_social_media = rng.choice([0, rng.randint(10, 15)])
        else:
            hours_social_media = round(rng.uniform(0, 8), 1)

        # Friends/followers count: 5% extremes (0 or 5000-10000), else 50-1500
        if rng.random() < 0.05:
            friends_followers = rng.choice([0, rng.randint(5000, 10000)])
        else:
            friends_followers = rng.randint(50, 1500)

        # Academic performance, weighted by usage tier. A normal draw that
        # rounds to 0.0 lands in the zero-usage tier, same as an outlier 0.
        if hours_social_media == 0:  # Exceptionally low usage
            academic_performance = rng.choices(["Good", "Poor"], weights=[0.9, 0.1])[0]
        elif hours_social_media >= 10:  # Exceptionally high usage
            academic_performance = rng.choices(["Good", "Poor"], weights=[0.2, 0.8])[0]
        elif hours_social_media < 1:
            academic_performance = rng.choices(["Good", "Average"], weights=[0.8, 0.2])[0]
        elif hours_social_media <= 3:
            academic_performance = rng.choices(["Good", "Average"], weights=[0.6, 0.4])[0]
        elif hours_social_media <= 5:
            academic_performance = rng.choices(["Average", "Good", "Poor"], weights=[0.5, 0.3, 0.2])[0]
        else:
            # Everything above 5 h and below the outlier threshold. A real
            # `else` guarantees the label is always assigned for this row and
            # can never be carried over from the previous iteration.
            academic_performance = rng.choices(["Average", "Poor", "Good"], weights=[0.5, 0.4, 0.1])[0]

        # Primary social media platform
        platform = rng.choice(["Facebook", "Instagram", "Twitter", "LinkedIn", "Snapchat", "TikTok"])

        # Posting frequency: heavy users post at least sometimes, zero-hour
        # users never post, everyone else is weighted towards "Rarely"
        if hours_social_media >= 10:
            posting_frequency = rng.choices(["Sometimes", "Often", "Daily"], weights=[0.3, 0.4, 0.3])[0]
        elif hours_social_media == 0:
            posting_frequency = "Never"
        else:
            posting_frequency = rng.choices(["Never", "Rarely", "Sometimes", "Often"], weights=[0.1, 0.5, 0.3, 0.1])[0]

        # Add the row to the dataset
        data.append({
            "Age": age,
            "Gender": gender,
            "Major": major,
            "Hours on Social Media": hours_social_media,
            "Friends/Followers": friends_followers,
            "Social Media Platform": platform,
            "Posting Frequency": posting_frequency,
            "Academic Performance": academic_performance,
        })

    # Passing the column list keeps the schema intact for an empty result.
    return pd.DataFrame(data, columns=columns)

# Build a 125-row student dataset (overrides the generator's default size)
df = generate_student_data_with_outliers(n=125)

# Show the first 20 rows as plain text
print(df.head(n=20))

# Write the frame to CSV, leaving the row index out of the file
df.to_csv("university_students_with_outliers_saib.csv", index=False)


    Age  Gender             Major  Hours on Social Media  Friends/Followers  \
0    25    Male        Psychology                    0.9                438   
1    23    Male                IT                    6.3                  0   
2    21    Male           Biology                    3.7                901   
3    22    Male        Psychology                    6.7                760   
4    18  Female                IT                    5.2                390   
5    26  Female                IT                    1.1                965   
6    20    Male               CSE                    3.2               1386   
7    20    Male          Business                    7.3               1329   
8    18  Female                IT                    2.1                597   
9    22    Male           Biology                    3.0                411   
10   20    Male        Mechanical                    6.4               1213   
11   26    Male  Computer Science                   

In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# Seed NumPy's global generator so every draw below is repeatable
np.random.seed(42)

# Number of synthetic samples to generate
n_samples = 130

# Single source of truth for the output file, so the path that is written and
# the path reported in the confirmation message cannot drift apart.
output_path = 'synthetic_mental_health_data_saib.csv'

# Generate synthetic numerical features.
# np.random.randint excludes its upper bound, so the ranges below stop one
# short of the second argument.
ages = np.random.randint(18, 60, size=n_samples)  # Ages 18 to 59 inclusive
hours_of_sleep = np.random.normal(loc=7, scale=1.5, size=n_samples).clip(3, 10)  # Sleep hours clipped to [3, 10]
stress_levels = np.random.randint(1, 11, size=n_samples)  # Stress level from 1 to 10
exercise_frequency = np.random.randint(0, 7, size=n_samples)  # Exercise frequency, 0 to 6
social_interaction_frequency = np.random.randint(0, 7, size=n_samples)  # Social interaction frequency, 0 to 6

# Generate synthetic categorical features
genders = np.random.choice(['Male', 'Female', 'Other'], size=n_samples, p=[0.48, 0.48, 0.04])
# NOTE(review): `majors` and `social_media` are generated but never added to
# the DataFrame below, so they do not reach the CSV. Confirm whether they were
# meant to be columns; they are kept here so the module-level names still exist.
majors = np.random.choice(['Engineering', 'Business', 'Arts', 'Science', 'Other'], size=n_samples)
social_media = np.random.choice(['Instagram', 'Facebook', 'Twitter', 'LinkedIn', 'None'], size=n_samples)

# Create the DataFrame
synthetic_data = pd.DataFrame({
    'Age': ages,
    'Gender': genders,
    'Hours of Sleep per Night': np.round(hours_of_sleep, 1),
    'Stress Level': stress_levels,
    'Exercise Frequency': exercise_frequency,
    'Social Interaction Frequency': social_interaction_frequency
})

# Display a sample of the synthetic dataset
print(synthetic_data.head())

# Save synthetic data to a CSV file and report the path that was actually written
synthetic_data.to_csv(output_path, index=False)
print(f"Synthetic dataset saved to '{output_path}'.")


   Age  Gender  Hours of Sleep per Night  Stress Level  Exercise Frequency  \
0   56    Male                       7.2             3                   6   
1   46    Male                       6.8             2                   1   
2   32    Male                      10.0             9                   0   
3   25  Female                       5.5            10                   3   
4   38    Male                       5.8             6                   2   

   Social Interaction Frequency  
0                             2  
1                             2  
2                             1  
3                             6  
4                             3  
Synthetic dataset saved to 'synthetic_mental_health_data.csv'.
