### General Imports

In [None]:
!pip install pandas

In [None]:
import pandas as pd

In [None]:
# Path to the source CSV (under /content/drive — presumably a mounted Google Drive; confirm).
path = "/content/drive/MyDrive/Personal details/Dataset/els_02_12_byf3stu_v1_0.csv"

# Read the whole file into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(n=5)

In [None]:
import random

# Seed Python's built-in generator so draws from the `random` module repeat
# across runs. This does not affect NumPy or scikit-learn randomness.
random.seed(a=10)

In [None]:
# Disabled cell, kept for reference only (the stray leading "|" that made this
# cell a SyntaxError has been removed).
# # Clustered heatmap for the entire correlation matrix
# plt.figure(figsize=(14, 12))
# sns.clustermap(df.corr(), cmap='coolwarm', annot=True, fmt=".2f")
# plt.title("Clustered Correlation Matrix Heatmap")
# plt.show()


# **Week 1**

In [None]:
# The feature count is the number of columns in the raw DataFrame.
num_features = df.shape[1]
print(f"Number of features: {num_features}")


In [None]:
# The row count is the first component of the DataFrame's shape.
num_rows = df.shape[0]
print(f"Number of rows: {num_rows}")


In [None]:
# Preview the first five rows of the columns used in the Week 1 analysis.
preview_columns = [
    'STU_ID', 'STRAT_ID', 'PSU',
    'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'F1TXMSTD',
]
df.loc[:, preview_columns].head(5)


In [None]:
# Code -> label lookup tables for the categorical columns.
# The labelled export is produced for the Power BI analysis.
_skip_codes = {-4: 'non-respondent', -8: 'legit skip'}

sex_mapping = {1: 'male', 2: 'female', **_skip_codes}

race_mapping = {
    1: 'native',
    2: 'asian',
    3: 'black',
    4: 'hispanic (no race)',
    5: 'hispanic (race)',
    6: 'multi (non-hispanic)',
    7: 'white',
    **_skip_codes,
}

lang_mapping = {0: 'no', 1: 'yes', **_skip_codes, -9: 'missing'}


In [None]:
# Copy the columns of interest, then swap the numeric codes of the three
# categorical columns for their text labels.
selected_df = df.loc[:, [
    'STU_ID', 'STRAT_ID', 'PSU',
    'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'F1TXMSTD',
]].copy()

for _col, _labels in (('BYSEX', sex_mapping),
                      ('BYRACE', race_mapping),
                      ('BYSTLANG', lang_mapping)):
    selected_df[_col] = selected_df[_col].replace(_labels)


In [None]:
# Preview the labelled frame (first five rows).
selected_df.head(n=5)


In [None]:
# Write the labelled frame to CSV in the working directory, without the index column.
selected_df.to_csv(path_or_buf='selected_data_with_replaced_values.csv', index=False)

In [None]:
# Per-column count of NaN cells. Survey sentinel codes such as -4 / -8 are
# ordinary values at this point and are not counted here.
missing_values = selected_df.isna().sum()

print("Missing Values:\n", missing_values)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Count plot: number of rows per BYSEX label.
plt.figure(figsize=(8, 5))
sns.countplot(data=selected_df, x='BYSEX')
plt.xlabel('Gender (BYSEX)')
plt.ylabel('Count')
plt.title('Distribution of Gender')
plt.show()





In [None]:
# Bar chart of BYRACE label frequencies (value_counts orders them most common first).
race_counts = selected_df['BYRACE'].value_counts()

plt.figure(figsize=(10, 6))
race_counts.plot.bar()
plt.xlabel('Ethnicity/Race (BYRACE)')
plt.ylabel('Count')
plt.title('Ethnicity/Race Distribution')
plt.show()




In [None]:
# Count plot: number of rows per BYSTLANG label.
plt.figure(figsize=(8, 5))
sns.countplot(data=selected_df, x='BYSTLANG')
plt.xlabel('English Proficiency (BYSTLANG)')
plt.ylabel('Count')
plt.title('Distribution of English Proficiency')
plt.show()

# Linear Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [None]:
# Modelling frame 1: raw numeric codes for all twelve columns, including STRAT_ID and PSU.
selected_df1 = df.loc[:, [
    'STU_ID', 'STRAT_ID', 'PSU',
    'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'F1TXMSTD',
]].copy()

In [None]:
# Target is F1TXMSTD; every other column of selected_df1 is a feature.
# NOTE(review): the -4 / -8 / -9 codes are not filtered here, so they enter the
# fit as ordinary numbers — confirm this is intended as a baseline.
y = selected_df1['F1TXMSTD']
X = selected_df1.drop('F1TXMSTD', axis=1)

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
# Score the held-out rows with the fitted linear model.
predictions = model.predict(X=X_test)

In [None]:
# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')

### Hyperparameter Tuning and Random Forest Regressor: very high mean squared error (MSE), so try another model

In [None]:
# Finding the best hyper parameters
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestRegressor

# # Define the model and the hyperparameter grid
# model = RandomForestRegressor()
# param_grid = {
#     'n_estimators': [50, 100, 150],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# # Create the GridSearchCV object
# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# # Fit the grid search to the data
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters
# print("Best Hyperparameters:", grid_search.best_params_)

# # Get the best model
# best_model = grid_search.best_estimator_

# # Make predictions with the best model
# best_predictions = best_model.predict(X_test)



Hyperparameter tuning was implemented to optimize the performance of a Random Forest Regressor in predicting F1TXMSTD. By adjusting key parameters like the number of estimators and tree depth, the goal was to enhance predictive accuracy, avoid overfitting or underfitting, improve generalization to new data, and boost computational efficiency. The process aimed to fine-tune the model for optimal results in handling the given dataset.


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Re-create the same 80/20 split of the current X / y (same seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameters for the forest (hard-coded rather than searched in this cell).
best_hyperparameters = dict(
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
)

# Build and fit the forest on the training portion only.
best_model = RandomForestRegressor(random_state=42, **best_hyperparameters)
best_model.fit(X_train, y_train)

# Predict the held-out rows and report the test-set MSE.
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error on Test Set: {mse}')


Dataframe 2, which excludes STRAT_ID and PSU

In [None]:
# Modelling frame 2: same columns as frame 1 minus STRAT_ID and PSU.
selected_df2 = df.loc[:, [
    'STU_ID',
    'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'F1TXMSTD',
]].copy()

My Reference:

Student ID is composed of the 4-digit School ID (which consists of
the 3-digit Stratum and 1-digit PSU) and a 2-digit sequential
student code within school.
Stratum (STRAT_ID) and PSU are embedded in STU_ID for ease of use
in certain variance estimation programs.

In [None]:
# Target is F1TXMSTD; every other column of selected_df2 is a feature.
y = selected_df2['F1TXMSTD']
X = selected_df2.drop('F1TXMSTD', axis=1)

# Hold out 20% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training portion.
model = LinearRegression()
model.fit(X_train, y_train)


In [None]:
# Score the held-out rows with the linear model fitted on frame 2.
predictions = model.predict(X=X_test)

In [None]:
# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Re-create the same 80/20 split of the current X / y (same seed as before).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameters for the forest (hard-coded rather than searched in this cell).
best_hyperparameters = dict(
    max_depth=10,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=100,
)

# Build and fit the forest on the training portion only.
best_model = RandomForestRegressor(random_state=42, **best_hyperparameters)
best_model.fit(X_train, y_train)

# Predict the held-out rows and report the test-set MSE.
predictions = best_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error on Test Set: {mse}')


# Week 2

# Variable Selection for Matching

In any analysis, selecting the right set of variables is crucial. Here, we identify and describe the key variables for matching in our dataset.

## 1. Identification Variables

### 1.1 STU_ID: Student ID (Unique)
- **Description**: Unique identifier for each student.
- **Role**: Used for individual-level identification.

### 1.2 SCH_ID: School ID (Restricted Data)
- **Description**: Identifier for the school (Note: Restricted Data).
- **Role**: Potentially useful for school-level matching.

### 1.3 STRAT_ID: Stratum ID
- **Description**: Identifier for the stratum.
- **Role**: Used to define strata for sampling.

### 1.4 PSU: Primary Sampling Unit (School)
- **Description**: Identifier for the primary sampling unit (school).
- **Role**: Used in the sampling design.

## 2. Demographic Variables

### 2.1 BYSEX: Gender
- **Description**: 1 = Male, 2 = Female, -4/-8 = Non-respondent/Legit Skip.
- **Role**: Consider for gender-based matching.

### 2.2 BYRACE: Race
- **Description**: Various codes for different racial categories.
- **Role**: Consider for race-based matching.

### 2.3 BYSTLANG: English Proficiency
- **Description**: 0 = No, 1 = Yes, -4/-8/-9 = Non-respondent/Legit Skip/Missing.
- **Role**: Consider for language-based matching.

### 2.4 BYDOB_P: Year/Month of Birth
- **Description**: 4-digit year, 2-digit month.
- **Role**: May be useful for age-based matching.

## 3. Educational Variables

### 3.1 BYMOTHED: Mother's Education Level
- **Description**: 1-8 represent different education levels, -4/-8/-9 = Non-response/Legit Skip/Missing.
- **Role**: Consider for parental education-based matching.

### 3.2 BYFATHED: Father's Education Level
- **Description**: 1-8 represent different education levels, -4/-8/-9 = Non-response/Legit Skip/Missing.
- **Role**: Consider for parental education-based matching.

## 4. Academic Performance Variables

### 4.1 BYSES1: Socioeconomic Status
- **Description**: Ranges between -2.11 and 1.82, -4/-8 = Non-response/Legit Skip.
- **Role**: Consider for socioeconomic status-based matching.

### 4.2 BYTXMSTD: Math Score, 10th Grade
- **Description**: Range between 19.38 and 86.68, -8 = Legit Skip.
- **Role**: Consider for academic performance-based matching.

### 4.3 BYTXRSTD: Reading Score, 10th Grade
- **Description**: Range between 22.57 and 78.76, -8 = Legit Skip.
- **Role**: Consider for academic performance-based matching.

### 4.4 F1TXMSTD: Math Score, 12th Grade
- **Description**: Range between 19.82 and 79.85, -8 = Legit Skip.
- **Role**: Consider for academic performance-based matching.


## Manual Parameter optimisation

In [None]:
# Week 2 working frame: adds BYTXRSTD and leaves out STRAT_ID, PSU and BYDOB_P.
selected_df = df.loc[:, [
    'STU_ID',
    'BYSEX', 'BYRACE', 'BYSTLANG',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'BYTXRSTD', 'F1TXMSTD',
]].copy()

In [None]:
# Preview the first five rows of the Week 2 frame.
selected_df.head(n=5)

In [None]:
# Number of rows whose BYSEX is one of the codes -4 or -8.
is_non_response = selected_df['BYSEX'].isin([-4, -8])
non_responses_sex = int(is_non_response.sum())

print(f'Total number of rows in BYSEX column with values -4 or -8: {non_responses_sex}')


In [None]:
# Collapse both BYSEX codes (-4, -8) into the single placeholder -999.
selected_df['BYSEX'] = selected_df['BYSEX'].replace(to_replace=[-4, -8], value=-999)

# Show the resulting value counts to confirm the recode.
selected_df['BYSEX'].value_counts()


In [None]:
# Collapse both BYRACE codes (-4, -8) into the single placeholder -999.
selected_df['BYRACE'] = selected_df['BYRACE'].replace(to_replace=[-4, -8], value=-999)

# Show the resulting value counts to confirm the recode.
selected_df['BYRACE'].value_counts()

In [None]:
# Collapse the BYSTLANG codes (-4, -8, -9) into the single placeholder -999.
selected_df['BYSTLANG'] = selected_df['BYSTLANG'].replace(to_replace=[-4, -8, -9], value=-999)

# Show the resulting value counts to confirm the recode.
selected_df['BYSTLANG'].value_counts()

In [None]:
# Collapse the BYMOTHED codes (-4, -8, -9) into the single placeholder -999.
selected_df['BYMOTHED'] = selected_df['BYMOTHED'].replace(to_replace=[-4, -8, -9], value=-999)

# Show the resulting value counts to confirm the recode.
selected_df['BYMOTHED'].value_counts()

In [None]:
# Collapse the BYFATHED codes (-4, -8, -9) into the single placeholder -999.
selected_df['BYFATHED'] = selected_df['BYFATHED'].replace(to_replace=[-4, -8, -9], value=-999)

# Show the resulting value counts to confirm the recode.
selected_df['BYFATHED'].value_counts()

In [None]:
# Replace the BYSES1 codes (-4, -8, -9) with 0.
# NOTE(review): unlike the other columns, the replacement value here is 0, not
# -999 — presumably because BYSES1 is a continuous score; confirm this is intended.
selected_df['BYSES1'] = selected_df['BYSES1'].replace(to_replace=[-4, -8, -9], value=0)

# Show the resulting value counts to confirm the recode.
selected_df['BYSES1'].value_counts()

In [None]:
# Inspect the value distribution of BYTXMSTD (no recode is applied to this column here).
selected_df['BYTXMSTD'].value_counts(dropna=True)

In [None]:
# Start again from the raw codes: rebuild the Week 2 frame from df,
# discarding the -999 / 0 recodes applied to the previous copy.
selected_df = df.loc[:, [
    'STU_ID',
    'BYSEX', 'BYRACE', 'BYSTLANG',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'BYTXRSTD', 'F1TXMSTD',
]].copy()

In [None]:
import numpy as np

# Mark every -4 / -8 / -9 cell, in any column, as missing (NaN).
selected_df = selected_df.replace(to_replace=[-4, -8, -9], value=np.nan)

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Work on a fresh copy of the columns of interest.
selected_df = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYTXMSTD', 'BYTXRSTD', 'F1TXMSTD']].copy()

# Convert the sentinel codes (-4, -8, -9) to NaN. The frame is rebuilt from df
# just above, so without this step the codes would be fitted as real values and
# the imputer below would not treat them as missing.
selected_df = selected_df.replace([-4, -8, -9], np.nan)

# The target cannot be imputed: keep only rows where F1TXMSTD is present.
selected_df = selected_df.dropna(subset=['F1TXMSTD'])

# Separate features and target variable
X = selected_df.drop('F1TXMSTD', axis=1)
y = selected_df['F1TXMSTD']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing feature values with the column mean. The means are learned
# from the training rows only and then reused for the test rows.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train_imputed, y_train)

# Make predictions on the test set
predictions = model.predict(X_test_imputed)

# Evaluate the model
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')



## Linear Regression after Dropping NaN rows


In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Features plus the F1TXMSTD target, copied from the raw frame.
selected_df = df.loc[:, [
    'STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'BYTXRSTD', 'F1TXMSTD',
]].copy()

# Turn the -4 / -8 / -9 codes into NaN, then keep only complete rows.
selected_df = selected_df.replace(to_replace=[-4, -8, -9], value=np.nan)
selected_df_cleaned = selected_df.dropna()

num_rows = selected_df_cleaned.shape[0]
print(f"Number of rows: {num_rows}")

# Target vs. features.
y = selected_df_cleaned['F1TXMSTD']
X = selected_df_cleaned.drop(columns=['F1TXMSTD'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares and score the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')


What's surprising is that, without Student ID, accuracy is still 81%.

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the latest predictions on the held-out rows.
r_squared = r2_score(y_true=y_test, y_pred=predictions)
print(f'R-squared: {r_squared}')


### Hyperparameter Tuning, GridSearchCV

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

# Features plus the F1TXMSTD target, copied from the raw frame.
selected_df = df.loc[:, [
    'STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'BYTXRSTD', 'F1TXMSTD',
]].copy()

# Turn the -4 / -8 / -9 codes into NaN (np comes from an earlier cell),
# then keep only complete rows.
selected_df = selected_df.replace(to_replace=[-4, -8, -9], value=np.nan)
selected_df_cleaned = selected_df.dropna()

# Target vs. features.
y = selected_df_cleaned['F1TXMSTD']
X = selected_df_cleaned.drop(columns=['F1TXMSTD'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Estimator whose constructor options are searched.
model = LinearRegression()

# Candidate settings: 2 x 1 x 2 = 4 combinations.
param_grid = dict(
    fit_intercept=[True, False],
    positive=[False],
    copy_X=[True, False],
)

# MSE wrapped as a scorer; greater_is_better=False negates it so that
# GridSearchCV, which maximises the score, picks the lowest error.
scoring = make_scorer(mean_squared_error, greater_is_better=False)

# 5-fold cross-validated search on the training rows, using all cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scoring,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

# Score the held-out rows with the refitted best estimator.
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test)

mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error of the Best Model: {mse}')


In [None]:
# Coefficient of determination of the best model's predictions on the held-out rows.
r_squared = r2_score(y_true=y_test, y_pred=predictions)
print(f'R-squared: {r_squared}')

## Random Forest Regressor

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# NOTE(review): this frame has no F1TXMSTD column and the target below is
# BYTXMSTD, unlike the earlier regressions — confirm that is intended.
selected_df = df.loc[:, [
    'STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P',
    'BYMOTHED', 'BYFATHED', 'BYSES1',
    'BYTXMSTD', 'BYTXRSTD',
]].copy()

# Turn the -4 / -8 / -9 codes into NaN, then keep only complete rows.
selected_df = selected_df.replace(to_replace=[-4, -8, -9], value=np.nan)
selected_df_cleaned = selected_df.dropna()

# Target vs. features.
y = selected_df_cleaned['BYTXMSTD']
X = selected_df_cleaned.drop(columns=['BYTXMSTD'])

# 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 100-tree forest with a fixed seed, fitted on the training rows.
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Score the held-out rows.
predictions = rf_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
print(f'Mean Squared Error: {mse}')


In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate settings: 3 x 3 x 3 x 3 = 81 combinations.
param_grid = dict(
    n_estimators=[200, 300, 500],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# 5-fold cross-validated search over the forest defined earlier, scored by
# negative MSE and run on all cores.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Hyperparameters:", grid_search.best_params_)

# Score the held-out rows with the refitted best estimator.
best_rf_model = grid_search.best_estimator_
best_predictions = best_rf_model.predict(X_test)

best_mse = mean_squared_error(y_true=y_test, y_pred=best_predictions)
print(f'Mean Squared Error of the Best Model: {best_mse}')


1. Best Hyperparameters: {'max_depth': 10, 'n_estimators': 200}

Fitting 5 folds for each of 9 candidates, totalling 45 fits

Mean Squared Error of the Best Model: 20.649908255679446

2. Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 500}

Fitting 5 folds for each of 81 candidates, totalling 405 fits

Mean Squared Error of the Best Model: 20.405364924774876

## Neural Networks

In [None]:
import numpy as np
import pandas as pd
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers

# Fix TensorFlow's global seed so weight initialisation and batch shuffling
# are repeatable from run to run.
tensorflow.random.set_seed(42)

# Features plus the BYTXMSTD target, copied from the raw frame.
selected_df = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG','BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYTXMSTD', 'BYTXRSTD']].copy()

# Replace -4, -8, -9 with NaN
selected_df.replace([-4, -8, -9], np.nan, inplace=True)

# Drop rows with missing values
selected_df_cleaned = selected_df.dropna()

# Separate features and target variable
X = selected_df_cleaned.drop('BYTXMSTD', axis=1)
y = selected_df_cleaned['BYTXMSTD']

# Split BEFORE scaling so the scaler's mean and variance are learned from the
# training rows only. Fitting it on all rows would leak test-set statistics
# into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features using training-set statistics
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scaled copy of the full feature matrix, kept under its original name in case
# other cells reference it.
X_scaled = scaler.transform(X)

# Build the neural network model: two hidden ReLU layers and one linear output unit
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(1)  # Output layer for regression
])

# Compile the model: MSE is the training loss, MAE is reported alongside it
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model; the last 20% of the training rows are used for validation
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, verbose=1)

# Evaluate the model on the test set (returns the loss, then the compiled metric)
mse, mae = model.evaluate(X_test, y_test)
print(f'Mean Squared Error: {mse}, Mean Absolute Error: {mae}')


### Linear activation function

# Week 3


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Draw 20 columns at random (fixed seed) so the heatmap stays readable.
subset_features = df.sample(n=20, axis='columns', random_state=42)

# Pairwise correlations between the sampled columns.
correlation_matrix = subset_features.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(12, 10))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title("Correlation Matrix Heatmap (Subset of Features)")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Correlation of every column of df with the F1TXMSTD column.
correlation_with_target = df.corrwith(df['F1TXMSTD'])

# Single-column frame of those correlations, indexed by feature name.
correlation_df = correlation_with_target.to_frame(name='Correlation with F1TXMSTD')

# One-row heatmap (features along the x-axis), without a colour bar.
plt.figure(figsize=(12, 8))
sns.heatmap(data=correlation_df.T, cmap='coolwarm', annot=True, fmt=".2f", cbar=False)
plt.title("Correlation with F1TXMSTD - Heatmap")
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Number of top-correlated features to plot. The comment and title used to say
# "10" while the code selected 30; the count is now defined once and reused.
top_n = 30

# Absolute correlation of every column with the target variable
correlation_with_target = df.corrwith(df['F1TXMSTD']).abs()

# Select the top_n features with the highest absolute correlation.
# F1TXMSTD correlates perfectly with itself, so it is part of the selection.
top_features = correlation_with_target.nlargest(top_n).index

# Extract the corresponding subset of the DataFrame
subset_features = df[top_features]

# Calculate the correlation matrix for the selected features
correlation_matrix = subset_features.corr()

# Create a heatmap
plt.figure(figsize=(18, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title(f"Top {top_n} Features with Highest Correlation with F1TXMSTD - Heatmap")
plt.show()


In [None]:
import pandas as pd

# Number of rows to list. The comment and printed heading used to say "10"
# while the code showed 100; the count is now defined once and reused.
top_n = 100

# Calculate the correlation with the target variable
correlation_with_target = df.corrwith(df['F1TXMSTD'])

# Feature names alongside their absolute correlation with the target
correlation_df = pd.DataFrame({'Feature': correlation_with_target.index, 'Correlation with F1TXMSTD': correlation_with_target.abs()})

# Sort the DataFrame by absolute correlation values in descending order
sorted_correlation_df = correlation_df.sort_values(by='Correlation with F1TXMSTD', ascending=False)

# Display the top_n correlations
top_correlations = sorted_correlation_df.head(top_n)
print(f"Top {top_n} Features with Highest Correlation with F1TXMSTD:")
print(top_correlations)


In [None]:
import pandas as pd

# Feature whose correlation with the target is reported. The lookup used
# 'BYTX1MPP' while the message named 'F1NELS2M'; one variable now drives both,
# so the printed label always matches the value shown.
# NOTE(review): 'F1NELS2M' was chosen because the message and the following
# cell both refer to it — switch to 'BYTX1MPP' if that was the intended feature.
feature_name = 'F1NELS2M'

# Calculate the correlation with the target variable
correlation_with_target = df.corrwith(df['F1TXMSTD'])

# Extract the correlation value for the chosen feature
correlation_xxx = correlation_with_target[feature_name]

print(f"Correlation of '{feature_name}' with 'F1TXMSTD': {correlation_xxx}")


In [None]:
# Display 10 randomly sampled values from the 'F1NELS2M' column.
# The original used head(10), which returns the first ten rows rather than a
# random draw. random_state is fixed so the same rows appear on every run.
random_values = df['F1NELS2M'].sample(n=10, random_state=42)
print("10 Random Values from 'F1NELS2M':")
print(random_values)


## Linear Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Features plus the F1TXMSTD target, copied from the raw frame.
selected_df = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYTXMSTD', 'BYTXRSTD', 'F1TXMSTD']].copy()

# Replace -4, -8, -9 with NaN
selected_df.replace([-4, -8, -9], np.nan, inplace=True)

# Drop rows with missing values. The explicit .copy() makes the result an
# independent frame, so adding the prediction column below does not raise
# SettingWithCopyWarning.
selected_df_cleaned = selected_df.dropna().copy()

num_rows = len(selected_df_cleaned)
print(f"Number of rows: {num_rows}")

# Separate features and target variable
X = selected_df_cleaned.drop('F1TXMSTD', axis=1)
y = selected_df_cleaned['F1TXMSTD']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Add a 'Predicted_F1TXMSTD' column holding predictions for ALL cleaned rows
# (training and test rows alike), not just the held-out ones.
selected_df_cleaned['Predicted_F1TXMSTD'] = model.predict(X)

# Display the DataFrame with the predicted values
print(selected_df_cleaned.head())

# Evaluate the model on the held-out rows only
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# 'selected_df_cleaned' is your DataFrame with the predicted values
## selected_df_cleaned.to_csv('output_file.csv', index=False)



In [None]:
# Pairwise Pearson correlations between all columns of the cleaned frame,
# including the prediction column added by the previous cell.
correlation_matrix = selected_df_cleaned.corr(method='pearson')

print(correlation_matrix)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the most recently computed correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title("Correlation Matrix")
plt.show()


In [None]:
# 'correlation_matrix' is your correlation matrix
#correlation_matrix.to_csv('correlation_matrix.csv')



**Linear Regression:**

Formula: $y = mx + b$

- $y$ is the dependent variable (target),
- $x$ is the independent variable (predictor),
- $m$ is the slope of the line, and
- $b$ is the y-intercept.

**Basic Explanation:** Linear regression models the relationship between a single independent variable and a dependent variable as a straight line. The slope ($m$) represents the change in the dependent variable for a one-unit change in the independent variable.

**Multiple Regression:**

Formula: $y = b_0 + b_1x_1 + b_2x_2 + \ldots + b_nx_n$

- $y$ is the dependent variable,
- $x_1, x_2, \ldots, x_n$ are the independent variables,
- $b_0$ is the y-intercept,
- $b_1, b_2, \ldots, b_n$ are the coefficients representing the change in $y$ for a one-unit change in the corresponding $x$ variable.

**Basic Explanation:** Multiple regression extends the concept of linear regression to include multiple independent variables. It models the relationship between the dependent variable and multiple predictors. Each coefficient ($b_i$) represents the change in the dependent variable for a one-unit change in its predictor while holding the other variables constant.

In both cases, the goal is to find the values of the coefficients that minimize the difference between the predicted values and the actual values of the dependent variable.


## Multiple Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Features plus the F1TXMSTD target, copied from the raw frame.
selected_df = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYTXMSTD', 'BYTXRSTD', 'F1TXMSTD']].copy()

# Replace -4, -8, -9 with NaN
selected_df.replace([-4, -8, -9], np.nan, inplace=True)

# Drop rows with missing values. The explicit .copy() makes the result an
# independent frame, so adding the prediction column below does not raise
# SettingWithCopyWarning.
selected_df_cleaned = selected_df.dropna().copy()

num_rows = len(selected_df_cleaned)
print(f"Number of rows: {num_rows}")

# Separate features and target variable
X = selected_df_cleaned.drop('F1TXMSTD', axis=1)
y = selected_df_cleaned['F1TXMSTD']

# Split the data into train and test sets (this cell uses a 50/50 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Train the Multiple Regression model
multiple_regression_model = LinearRegression()
multiple_regression_model.fit(X_train, y_train)

# Make predictions on the test set
predictions = multiple_regression_model.predict(X_test)

# Add a 'Predicted_F1TXMSTD' column holding predictions for ALL cleaned rows
# (training and test rows alike), not just the held-out ones.
selected_df_cleaned['Predicted_F1TXMSTD'] = multiple_regression_model.predict(X)

# Display the DataFrame with the predicted values
print(selected_df_cleaned.head())

# Evaluate the model on the held-out rows only
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# 'selected_df_cleaned' is your DataFrame with the predicted values
## selected_df_cleaned.to_csv('output_file.csv', index=False)


In [None]:
# Coefficient of determination of the latest predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=predictions)
print(f'R-squared Score: {r2}')

In [None]:

# Pairwise Pearson correlations between all columns of the cleaned frame,
# including the prediction column added earlier.
correlation_matrix = selected_df_cleaned.corr(method='pearson')

print(correlation_matrix)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Annotated heatmap of the most recently computed correlation matrix.
plt.figure(figsize=(10, 8))
sns.heatmap(data=correlation_matrix, cmap='coolwarm', annot=True, fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Absolute difference between actual and predicted values, per row
abs_difference = (selected_df_cleaned['F1TXMSTD'] - selected_df_cleaned['Predicted_F1TXMSTD']).abs()

# Add the error and its colour band via .assign, which builds a new frame
# instead of writing into a possibly-shared one (avoids SettingWithCopyWarning).
# include_lowest=True puts an error of exactly 0 in the first band, and the
# open-ended last bin keeps errors above 100 in '>20' rather than leaving them
# unlabelled.
selected_df_cleaned = selected_df_cleaned.assign(
    Abs_Difference=abs_difference,
    Color=pd.cut(abs_difference,
                 bins=[0, 5, 10, 20, np.inf],
                 labels=['±5', '±10', '±20', '>20'],
                 include_lowest=True),
)

# Visualize the difference between actual and predicted values using a scatter plot with color-coding
plt.figure(figsize=(10, 6))
scatter_plot = sns.scatterplot(x='F1TXMSTD', y='Predicted_F1TXMSTD', hue='Color', data=selected_df_cleaned, palette=['green', 'yellow', 'orange', 'red'])
plt.title('Actual vs. Predicted F1TXMSTD with Color-Coding')
plt.xlabel('Actual F1TXMSTD')
plt.ylabel('Predicted F1TXMSTD')
plt.legend(title='Difference')
plt.show()

# Get count of points in each category
category_counts = selected_df_cleaned['Color'].value_counts()
print("\nCount of points in each category:")
print(category_counts)

# Evaluate the model (held-out rows only, as in the regression cell)
mse = mean_squared_error(y_test, predictions)
print(f'\nMean Squared Error: {mse}')


## New Feature - Multiple Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

#BYOCCHS - Occupation right after high school-coded
#F1NELS2M - Mathematics—NELS-equated 12th-grade estimated
# NOTE(review): only BYOCCHS is added to the column list below; F1NELS2M is
# described here but not selected — confirm that is intended.

# Features (now including BYOCCHS) plus the F1TXMSTD target, copied from the raw frame.
selected_df = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYTXMSTD', 'BYTXRSTD','BYOCCHS', 'F1TXMSTD']].copy()

# Replace -4, -8, -9 with NaN
selected_df.replace([-4, -8, -9], np.nan, inplace=True)

# Drop rows with missing values. The explicit .copy() makes the result an
# independent frame, so adding the prediction column below does not raise
# SettingWithCopyWarning.
selected_df_cleaned = selected_df.dropna().copy()

num_rows = len(selected_df_cleaned)
print(f"Number of rows: {num_rows}")

# Separate features and target variable
X = selected_df_cleaned.drop('F1TXMSTD', axis=1)
y = selected_df_cleaned['F1TXMSTD']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Multiple Regression model
multiple_regression_model = LinearRegression()
multiple_regression_model.fit(X_train, y_train)

# Make predictions on the test set
predictions = multiple_regression_model.predict(X_test)

# Add a 'Predicted_F1TXMSTD' column holding predictions for ALL cleaned rows
# (training and test rows alike), not just the held-out ones.
selected_df_cleaned['Predicted_F1TXMSTD'] = multiple_regression_model.predict(X)

# Display the DataFrame with the predicted values
print(selected_df_cleaned.head())

# Evaluate the model on the held-out rows only
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

# 'selected_df_cleaned' is your DataFrame with the predicted values
## selected_df_cleaned.to_csv('output_file.csv', index=False)


In [None]:
# Coefficient of determination of the test-set predictions
r2 = r2_score(y_true=y_test, y_pred=predictions)
print(f'R-squared Score: {r2}')

In [None]:

# Pairwise correlations between the numeric columns of 'selected_df_cleaned'.
# Restricting to numeric dtypes keeps this cell re-runnable after the plotting
# cell below adds the categorical 'Color' column, which DataFrame.corr()
# cannot convert to float on pandas >= 2.0.
correlation_matrix = selected_df_cleaned.select_dtypes(include='number').corr()

# Display the correlation matrix
print(correlation_matrix)



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Absolute difference between actual and predicted values, one value per row
abs_difference = (selected_df_cleaned['F1TXMSTD'] - selected_df_cleaned['Predicted_F1TXMSTD']).abs()

# Bucket the error for colour-coding.
# include_lowest=True keeps an exact match (error == 0) in the first bucket and the
# open-ended last edge keeps errors above 100 in '>20'; with the previous
# (0, 5] ... (20, 100] bins both cases silently became NaN and vanished from the plot.
# assign() builds a new frame, so the cell can be re-run safely.
selected_df_cleaned = selected_df_cleaned.assign(
    Abs_Difference=abs_difference,
    Color=pd.cut(abs_difference,
                 bins=[0, 5, 10, 20, np.inf],
                 labels=['±5', '±10', '±20', '>20'],
                 include_lowest=True),
)

# Visualize the difference between actual and predicted values using a scatter plot with color-coding
plt.figure(figsize=(10, 6))
scatter_plot = sns.scatterplot(x='F1TXMSTD', y='Predicted_F1TXMSTD', hue='Color', data=selected_df_cleaned, palette=['green', 'yellow', 'orange', 'red'])
plt.title('Actual vs. Predicted F1TXMSTD with Color-Coding')
plt.xlabel('Actual F1TXMSTD')
plt.ylabel('Predicted F1TXMSTD')
plt.legend(title='Difference')
plt.show()

# Get count of points in each category
category_counts = selected_df_cleaned['Color'].value_counts()
print("\nCount of points in each category:")
print(category_counts)

# Evaluate the model on the held-out split (y_test / predictions come from the training cell)
mse = mean_squared_error(y_test, predictions)
print(f'\nMean Squared Error: {mse}')


# Week 4

## Multiple Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predictor columns shared by the training and prediction frames. Using one list for
# both guarantees the model sees the same columns, in the same order, at fit and
# predict time.
feature_columns = ['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYOCCHS', 'BYTXRSTD']

# Training data: predictors plus the training target 'BYTXMSTD' ('F1TXMSTD' is excluded).
# -4 / -8 / -9 (non-respondent, legit skip, missing) become NaN.
training_data = df[feature_columns + ['BYTXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values
training_data_cleaned = training_data.dropna()

# Separate features and target variable.
# The target must NOT be part of X_train: previously the whole frame (including
# 'BYTXMSTD') was used as features, which leaked the target into the model and left it
# expecting 11 columns while predict() below was given only 10.
# NOTE(review): the model is fitted on 'BYTXMSTD' while its output is stored and scored
# as a prediction of 'F1TXMSTD' — this mirrors the KNN/SVM/Random Forest cells; confirm intended.
X_train = training_data_cleaned[feature_columns]
y_train = training_data_cleaned['BYTXMSTD']

# Train the Multiple Regression model
multiple_regression_model = LinearRegression()
multiple_regression_model.fit(X_train, y_train)

# Prediction data: same predictors plus 'BYTXMSTD' and the actual 'F1TXMSTD'.
# 'BYTXMSTD' is selected once (it used to be listed twice, creating duplicate columns).
prediction_data = df[feature_columns + ['BYTXMSTD', 'F1TXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values
prediction_data_cleaned = prediction_data.dropna()

# Keep the actual values for evaluation in the next cell
actual_f1txmstd = prediction_data_cleaned['F1TXMSTD']

# Drop the target; drop() returns a new frame instead of mutating the dropna() result in place
prediction_data_cleaned = prediction_data_cleaned.drop('F1TXMSTD', axis=1)

# Use the trained model to make predictions from the predictor columns only
predictions_f1txmstd = multiple_regression_model.predict(prediction_data_cleaned[feature_columns])

# Add the predicted values to the DataFrame
prediction_data_cleaned['Predicted_F1TXMSTD'] = predictions_f1txmstd

# Display the DataFrame with the predicted values
#print(prediction_data_cleaned.head())

# Save the DataFrame with predictions to a CSV file if needed
# prediction_data_cleaned.to_csv('predicted_output_file.csv', index=False)


In [None]:
# Score the linear-regression predictions against the actual 'F1TXMSTD' values
# ('actual_f1txmstd' was captured in the previous cell before the column was dropped).
predicted_f1txmstd = prediction_data_cleaned['Predicted_F1TXMSTD']

# Compute both metrics first, then report them
mse = mean_squared_error(y_true=actual_f1txmstd, y_pred=predicted_f1txmstd)
r2 = r2_score(y_true=actual_f1txmstd, y_pred=predicted_f1txmstd)

print(f'Mean Squared Error (MSE): {mse}')
print(f'R-squared (R2) Score: {r2}')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the prediction frame, this time keeping the actual 'F1TXMSTD' for plotting.
# -4 / -8 / -9 (non-respondent, legit skip, missing) become NaN.
prediction_data = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYTXMSTD', 'BYTXRSTD','BYOCCHS','F1TXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values; the explicit copy makes this an independent frame so
# the column assignment below does not raise SettingWithCopyWarning
prediction_data_cleaned = prediction_data.dropna().copy()

# 'predictions_f1txmstd' comes from the model cell above and is attached by position,
# so it must have been computed on exactly these rows (pandas raises on a length mismatch).
prediction_data_cleaned['Predicted_F1TXMSTD'] = predictions_f1txmstd

# Absolute difference between actual and predicted values
abs_difference = (prediction_data_cleaned['F1TXMSTD'] - prediction_data_cleaned['Predicted_F1TXMSTD']).abs()

# Bucket the error for colour-coding.
# include_lowest=True keeps an exact match (error == 0) in the first bucket and the
# open-ended last edge keeps errors above 100 in '>20'; with the previous
# (0, 5] ... (20, 100] bins both cases silently became NaN and vanished from the plot.
prediction_data_cleaned = prediction_data_cleaned.assign(
    Abs_Difference=abs_difference,
    Color=pd.cut(abs_difference,
                 bins=[0, 5, 10, 20, np.inf],
                 labels=['±5', '±10', '±20', '>20'],
                 include_lowest=True),
)

# Visualize the difference between actual and predicted values using a scatter plot with color-coding
plt.figure(figsize=(10, 6))
scatter_plot = sns.scatterplot(x='F1TXMSTD', y='Predicted_F1TXMSTD', hue='Color', data=prediction_data_cleaned, palette=['green', 'yellow', 'orange', 'red'])
plt.title('Actual vs. Predicted F1TXMSTD with Color-Coding')
plt.xlabel('Actual F1TXMSTD')
plt.ylabel('Predicted F1TXMSTD')
plt.legend(title='Difference')
plt.show()

# Get count of points in each category
category_counts = prediction_data_cleaned['Color'].value_counts()
print("\nCount of points in each category:")
print(category_counts)




## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Training data: predictors plus the training target 'BYTXMSTD'.
# -4 / -8 / -9 (non-respondent, legit skip, missing) become NaN.
training_data_knn = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYOCCHS', 'BYTXRSTD', 'BYTXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values
training_data_knn_cleaned = training_data_knn.dropna()

# Separate features and target variable
X_train_knn = training_data_knn_cleaned.drop('BYTXMSTD', axis=1)
y_train_knn = training_data_knn_cleaned['BYTXMSTD']

# NOTE(review): KNN is distance based and the features are passed in unscaled, including
# STU_ID — confirm this is intended, or standardise the features first.
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors (n_neighbors) as needed
knn_model.fit(X_train_knn, y_train_knn)

# Prediction data: the same predictors plus the actual 'F1TXMSTD' used for evaluation
prediction_data_knn = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYOCCHS', 'BYTXRSTD', 'F1TXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values
prediction_data_knn_cleaned = prediction_data_knn.dropna()

# Keep the actual values for evaluation in the next cell
actual_f1txmstd_knn = prediction_data_knn_cleaned['F1TXMSTD']

# Drop the target variable. drop() returns a new frame, so the dropna() result is not
# mutated in place (which raised SettingWithCopyWarning and made the cell non-idempotent).
prediction_data_knn_cleaned = prediction_data_knn_cleaned.drop('F1TXMSTD', axis=1)

# NOTE(review): the model was fitted on 'BYTXMSTD'; its output is stored as the
# 'F1TXMSTD' prediction — confirm this is the intended setup.
predictions_f1txmstd_knn = knn_model.predict(prediction_data_knn_cleaned)

# Add the predicted values to the DataFrame
prediction_data_knn_cleaned['Predicted_F1TXMSTD'] = predictions_f1txmstd_knn

# Display the DataFrame with the predicted values
#print(prediction_data_knn_cleaned.head())



In [None]:
# Score the KNN predictions against the actual 'F1TXMSTD' values captured earlier
predicted_f1txmstd_knn = prediction_data_knn_cleaned['Predicted_F1TXMSTD']

# Compute both metrics first, then report them
mse_knn = mean_squared_error(y_true=actual_f1txmstd_knn, y_pred=predicted_f1txmstd_knn)
r2_knn = r2_score(y_true=actual_f1txmstd_knn, y_pred=predicted_f1txmstd_knn)

print(f'Mean Squared Error (MSE) for KNN: {mse_knn}')
print(f'R-squared (R2) Score for KNN: {r2_knn}')


## SVM

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Training data: predictors plus the training target 'BYTXMSTD' ('F1TXMSTD' is excluded).
# -4 / -8 / -9 (non-respondent, legit skip, missing) become NaN.
training_data = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1','BYOCCHS', 'BYTXRSTD', 'BYTXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values
training_data_cleaned = training_data.dropna()

# Separate features and target variable
X_train = training_data_cleaned.drop('BYTXMSTD', axis=1)
y_train = training_data_cleaned['BYTXMSTD']

# Train the Support Vector Regression model (library defaults)
# NOTE(review): the features are passed in unscaled, including STU_ID — confirm this is intended.
svm_model = SVR()
svm_model.fit(X_train, y_train)

# Prediction data: the same predictors plus the actual 'F1TXMSTD' used for evaluation
prediction_data = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1','BYOCCHS', 'BYTXRSTD', 'F1TXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values
prediction_data_cleaned = prediction_data.dropna()

# Keep the actual values for evaluation in the next cell
actual_f1txmstd = prediction_data_cleaned['F1TXMSTD']

# Drop the target. drop() returns a new frame, so the dropna() result is not mutated
# in place (which raised SettingWithCopyWarning and made the cell non-idempotent).
prediction_data_cleaned = prediction_data_cleaned.drop('F1TXMSTD', axis=1)

# NOTE(review): the model was fitted on 'BYTXMSTD'; its output is stored as the
# 'F1TXMSTD' prediction — confirm this is the intended setup.
predictions_f1txmstd = svm_model.predict(prediction_data_cleaned)

# Add the predicted values to the DataFrame
prediction_data_cleaned['Predicted_F1TXMSTD'] = predictions_f1txmstd

# Display the DataFrame with the predicted values
#print(prediction_data_cleaned.head())

# Extract the predicted 'F1TXMSTD' values for the metrics cell
predicted_f1txmstd = prediction_data_cleaned['Predicted_F1TXMSTD']



In [None]:
# Score the SVM predictions against the actual 'F1TXMSTD' values:
# compute both metrics first, then report them
mse = mean_squared_error(y_true=actual_f1txmstd, y_pred=predicted_f1txmstd)
r2 = r2_score(y_true=actual_f1txmstd, y_pred=predicted_f1txmstd)

print(f'Mean Squared Error (MSE) for SVM: {mse}')
print(f'R-squared (R2) Score for SVM: {r2}')

## Random Forest Regressor

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Training data: predictors plus the training target 'BYTXMSTD' ('F1TXMSTD' is excluded).
# -4 / -8 / -9 (non-respondent, legit skip, missing) become NaN.
training_data = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1','BYOCCHS', 'BYTXRSTD', 'BYTXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values
training_data_cleaned = training_data.dropna()

# Separate features and target variable
X_train = training_data_cleaned.drop('BYTXMSTD', axis=1)
y_train = training_data_cleaned['BYTXMSTD']

# Train the Random Forest Regressor model.
# A fixed random_state makes the forest, the metrics and the exported CSV reproducible
# across runs; random.seed() from the top of the notebook does not seed scikit-learn.
rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train, y_train)

# Prediction data: the same predictors plus 'BYTXMSTD' and the actual 'F1TXMSTD'
prediction_data = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1','BYOCCHS', 'BYTXRSTD', 'BYTXMSTD', 'F1TXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values
prediction_data_cleaned = prediction_data.dropna()

# Keep the actual scores so they can be re-attached after predicting
actual_f1txmstd = prediction_data_cleaned['F1TXMSTD']
actual_BYTXMSTD = prediction_data_cleaned['BYTXMSTD']

# Remove both score columns so only the predictors the model was fitted on remain.
# drop() returns a new frame, so the dropna() result is not mutated in place
# (which raised SettingWithCopyWarning and made the cell non-idempotent).
prediction_data_cleaned = prediction_data_cleaned.drop(columns=['F1TXMSTD', 'BYTXMSTD'])

# NOTE(review): the model was fitted on 'BYTXMSTD'; its output is stored as the
# 'F1TXMSTD' prediction — confirm this is the intended setup.
predictions_f1txmstd = rf_model.predict(prediction_data_cleaned)

#Using best model to make predictions
#predictions_f1txmstd = best_rf_model.predict(prediction_data_cleaned)

# Add the predicted values to the DataFrame
prediction_data_cleaned['Predicted_F1TXMSTD'] = predictions_f1txmstd

# Display the DataFrame with the predicted values
#print(prediction_data_cleaned.head())

# Extract the predicted 'F1TXMSTD' values for the metrics cell
predicted_f1txmstd = prediction_data_cleaned['Predicted_F1TXMSTD']

# Add the actual 'F1TXMSTD' and 'BYTXMSTD' values back (aligned on the index) so the
# exported frame holds predictions next to the real scores
prediction_data_cleaned['F1TXMSTD'] = actual_f1txmstd
prediction_data_cleaned['BYTXMSTD'] = actual_BYTXMSTD




In [None]:
# Preview the first five rows of the Random Forest prediction frame (head() defaults to 5)
print(prediction_data_cleaned.head())

In [None]:
# Export the Random Forest Regressor predictions to CSV, without the index column
prediction_data_cleaned.to_csv(path_or_buf='predicted_output_RFR_02_22.csv', index=False)


### Best Hyperparameters

In [None]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV

# # Example hyperparameter tuning
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['auto', 'sqrt', 'log2']
# }

# rf_model = RandomForestRegressor()
# grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error')
# grid_search.fit(X_train, y_train)

# # Best hyperparameters
# best_params = grid_search.best_params_
# print(f'Best Hyperparameters: {best_params}')

# # Use the best model
# best_rf_model = grid_search.best_estimator_


In [None]:
# Score the Random Forest predictions against the actual 'F1TXMSTD' values:
# compute both metrics first, then report them
mse = mean_squared_error(y_true=actual_f1txmstd, y_pred=predicted_f1txmstd)
r2 = r2_score(y_true=actual_f1txmstd, y_pred=predicted_f1txmstd)

print(f'Mean Squared Error (MSE) for Random Forest: {mse}')
print(f'R-squared (R2) Score for Random Forest: {r2}')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Rebuild the prediction frame, this time keeping the actual 'F1TXMSTD' for plotting.
# -4 / -8 / -9 (non-respondent, legit skip, missing) become NaN.
prediction_data = df[['STU_ID', 'BYSEX', 'BYRACE', 'BYSTLANG', 'BYDOB_P', 'BYMOTHED', 'BYFATHED', 'BYSES1', 'BYTXMSTD', 'BYTXRSTD','BYOCCHS','F1TXMSTD']].replace([-4, -8, -9], np.nan)

# Drop rows with missing values; the explicit copy makes this an independent frame so
# the column assignment below does not raise SettingWithCopyWarning
prediction_data_cleaned = prediction_data.dropna().copy()

# 'predictions_f1txmstd' comes from the Random Forest cell above and is attached by
# position, so it must have been computed on exactly these rows (pandas raises on a
# length mismatch).
prediction_data_cleaned['Predicted_F1TXMSTD'] = predictions_f1txmstd

# Absolute difference between actual and predicted values
abs_difference = (prediction_data_cleaned['F1TXMSTD'] - prediction_data_cleaned['Predicted_F1TXMSTD']).abs()

# Bucket the error for colour-coding.
# include_lowest=True keeps an exact match (error == 0) in the first bucket and the
# open-ended last edge keeps errors above 100 in '>20'; with the previous
# (0, 5] ... (20, 100] bins both cases silently became NaN and vanished from the plot.
prediction_data_cleaned = prediction_data_cleaned.assign(
    Abs_Difference=abs_difference,
    Color=pd.cut(abs_difference,
                 bins=[0, 5, 10, 20, np.inf],
                 labels=['±5', '±10', '±20', '>20'],
                 include_lowest=True),
)

# Visualize the difference between actual and predicted values using a scatter plot with color-coding
plt.figure(figsize=(10, 6))
scatter_plot = sns.scatterplot(x='F1TXMSTD', y='Predicted_F1TXMSTD', hue='Color', data=prediction_data_cleaned, palette=['green', 'yellow', 'orange', 'red'])
plt.title('Actual vs. Predicted F1TXMSTD with Color-Coding')
plt.xlabel('Actual F1TXMSTD')
plt.ylabel('Predicted F1TXMSTD')
plt.legend(title='Difference')
plt.show()

# Get count of points in each category
category_counts = prediction_data_cleaned['Color'].value_counts()
print("\nCount of points in each category:")
print(category_counts)


