<a href="https://colab.research.google.com/github/Ermias-Azanaw/Payroll-system/blob/main/student_performance_bigdata_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random

# Seed both the NumPy and stdlib generators up front so that re-running the
# cell regenerates exactly the same dataset.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Number of rows to generate
n_rows = 65000

# Category lists shared by several columns.
_JOBS = ['teacher', 'health', 'services', 'at_home', 'other']
_YES_NO = ['yes', 'no']


def _pick(options, weights=None):
    """Draw n_rows labels from options; uniform unless weights are given."""
    return np.random.choice(options, n_rows, p=weights)


def _ints(low, high):
    """Draw n_rows integers from the half-open range [low, high)."""
    return np.random.randint(low, high, n_rows)


# Column definitions. The key order matters twice over: it fixes the column
# order of the DataFrame and the order in which the global RNG is consumed,
# so reordering entries would change the generated values.
data = {
    'school': _pick(['GP', 'MS'], [0.7, 0.3]),
    'sex': _pick(['F', 'M'], [0.5, 0.5]),
    'age': _ints(15, 23),
    'address': _pick(['U', 'R'], [0.7, 0.3]),
    'famsize': _pick(['GT3', 'LE3'], [0.6, 0.4]),
    'Pstatus': _pick(['T', 'A'], [0.8, 0.2]),
    'Medu': _ints(0, 5),
    'Fedu': _ints(0, 5),
    'Mjob': _pick(_JOBS),
    'Fjob': _pick(_JOBS),
    'reason': _pick(['home', 'reputation', 'course', 'other']),
    'guardian': _pick(['mother', 'father', 'other'], [0.6, 0.3, 0.1]),
    'traveltime': _ints(1, 5),
    'studytime': _ints(1, 5),
    'failures': _ints(0, 4),
    'schoolsup': _pick(_YES_NO, [0.3, 0.7]),
    'famsup': _pick(_YES_NO, [0.6, 0.4]),
    'paid': _pick(_YES_NO, [0.4, 0.6]),
    'activities': _pick(_YES_NO, [0.6, 0.4]),
    'nursery': _pick(_YES_NO, [0.8, 0.2]),
    'higher': _pick(_YES_NO, [0.9, 0.1]),
    'internet': _pick(_YES_NO, [0.8, 0.2]),
    'romantic': _pick(_YES_NO, [0.3, 0.7]),
    # Six columns that all take integer values 1..5, drawn in this order.
    **{
        scale_col: _ints(1, 6)
        for scale_col in ('famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health')
    },
    'absences': _ints(0, 30),
}

# Assemble the generated columns into a DataFrame
df = pd.DataFrame(data)

# Correlated grades (G1, G2, G3).
# The deterministic part starts at 15, loses 2 per failure and 0.1 per
# absence, and gains 1.5 per studytime level and 0.5 per combined
# parental-education level (Medu + Fedu). It can reach 15 + 6 + 4 = 25,
# so the upper clip at 20 below can apply even before noise is added.
failure_penalty = df['failures'] * 2
study_bonus = df['studytime'] * 1.5
absence_penalty = df['absences'] * 0.1
parent_edu_bonus = (df['Medu'] + df['Fedu']) * 0.5
# Terms are combined in the same left-to-right order as a single expression
# would be, so the floating-point results are unchanged.
base_grade = 15 - failure_penalty + study_bonus - absence_penalty + parent_edu_bonus

# Shared Gaussian noise (mean 0, sd 2) carried by all three grades
base_grade = base_grade + np.random.normal(0, 2, n_rows)


def _to_grade(raw_scores):
    """Clip raw scores to the 0-20 scale and round to whole-number grades."""
    return np.clip(raw_scores, 0, 20).round().astype(int)


# G1 is the noisy base itself; G2 and G3 each add their own independent
# Gaussian offset (mean, sd) on top of it, drawn in that order.
df['G1'] = _to_grade(base_grade)
for grade_col, (offset_mean, offset_sd) in (('G2', (0.5, 1)), ('G3', (1, 1.5))):
    df[grade_col] = _to_grade(base_grade + np.random.normal(offset_mean, offset_sd, n_rows))

# Save to CSV.
# The output path is defined once so that the file actually written and the
# confirmation message below cannot drift apart.
output_csv = 'student_performance_bigdata.csv'
df.to_csv(output_csv, index=False)  # index=False: do not write the row index

# Report what was generated, then preview the data and list the columns.
print(f"Dataset created with {len(df)} rows and {len(df.columns)} columns")
print(f"File saved as '{output_csv}'")
print("\nFirst few rows:")
print(df.head())
print("\nColumn names:")
print(df.columns.tolist())

Dataset created with 65000 rows and 33 columns
File saved as 'student_performance_bigdata.csv'

First few rows:
  school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0     GP   F   20       U     LE3       T     1     0  services  services   
1     MS   F   20       R     GT3       T     0     1   teacher  services   
2     MS   M   22       U     LE3       T     2     0   at_home   at_home   
3     GP   F   18       U     GT3       T     3     2     other    health   
4     GP   M   22       R     LE3       T     4     1     other   at_home   

   ... famrel freetime  goout  Dalc  Walc health absences  G1  G2  G3  
0  ...      1        2      2     1     4      4       15  18  18  18  
1  ...      5        4      5     4     2      1        5  20  20  20  
2  ...      2        1      5     5     4      4       22  17  20  18  
3  ...      3        4      2     5     2      5       25  20  20  20  
4  ...      4        2      3     1     3      3       10  18  20