In [1]:

import pandas as pd
import sklearn
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from pymongo import MongoClient

# Data Processing

In [3]:
# connect to mongodb
mongo = MongoClient(port=27017)

for dbname in mongo.list_database_names():
    if dbname == "texasSchoolsDB":
        print(f'{dbname} <----------')
    else:
        print(dbname)

admin
autosaurus
classDB
config
epa
fruits_db
garden_db
local
met
petsitly_marketing
test
texasSchoolsDB <----------
travel_db
uk_food


In [4]:
db = mongo["texasSchoolsDB"]
db.list_collection_names()

['demographics',
 'scores_coordinates',
 'scores_finances',
 'scores_finances_coordinates',
 'schools_2022_to_2023_geojson',
 'staar20_21',
 'DISTRICTS_INFO_2020_21',
 'account_ratings22',
 'current_districts_geojson',
 'tor20_21',
 'coordinates',
 'school_info',
 'teachers20_21',
 'demographic20_21']

In [6]:
# get the account_ratings22' collection from mongodb and convert it to a pandas dataframe
collection = db.account_ratings22
ratings = pd.DataFrame(list(collection.find()))

In [7]:
ratings.columns.to_list()

['_id',
 'DistrictNumber',
 'District',
 'Region',
 'County',
 'SchoolType',
 'AlternativeEducationAccountability',
 'Charter',
 'Number ofStudents',
 '%EconomicallyDisadvantaged',
 '% EB/ELStudents',
 'OverallRating',
 'OverallScore',
 'StudentAchievementRating',
 'StudentAchievementScore',
 'SchoolProgressRating',
 'SchoolProgressScore',
 'AcademicGrowthRating',
 'AcademicGrowthScore',
 'RelativePerformanceRating',
 'RelativePerformanceScore',
 'Closingthe Gaps Rating',
 'Closingthe Gaps Score',
 'DistinctionPostsecondaryReadiness',
 'CampusNumber',
 'Campus',
 'GradesServed',
 'DistinctionELA/Reading',
 'DistinctionMathematics',
 'DistinctionScience',
 'DistinctionSoc Studies',
 'DistinctionProgress',
 'DistinctionClosing the Gaps',
 'US CongressDistrict',
 'US CongressElection District',
 'TX HouseDistrict',
 'TX HouseElection District',
 'TX SenateDistrict',
 'TX SenateElection District',
 'Campus EnrollmentType',
 'SupportLabel',
 'PublicEducationGrant']

In [8]:
# Drop the non-beneficial columns 
ratings = ratings.drop(columns=['_id',
 'DistrictNumber',
 'District',
 'Region',
 'County',
 'SchoolType',
 'AlternativeEducationAccountability',
 'Charter',
 'StudentAchievementRating',
 'SchoolProgressRating',
 'AcademicGrowthRating',
 'RelativePerformanceRating',
 'Closingthe Gaps Rating',
 'CampusNumber',
 'Campus',
 'GradesServed',
 'DistinctionELA/Reading',
 'DistinctionScience',
 'DistinctionSoc Studies',
 'DistinctionProgress',
 'DistinctionClosing the Gaps',
 'US CongressDistrict',
 'US CongressElection District',
 'TX HouseDistrict',
 'TX HouseElection District',
 'TX SenateDistrict',
 'TX SenateElection District',
 'Campus EnrollmentType',
 'SupportLabel',
 'PublicEducationGrant'])

In [9]:
ratings.columns.to_list()

['Number ofStudents',
 '%EconomicallyDisadvantaged',
 '% EB/ELStudents',
 'OverallRating',
 'OverallScore',
 'StudentAchievementScore',
 'SchoolProgressScore',
 'AcademicGrowthScore',
 'RelativePerformanceScore',
 'Closingthe Gaps Score',
 'DistinctionPostsecondaryReadiness',
 'DistinctionMathematics']

In [10]:
ratings.dropna()

Unnamed: 0,Number ofStudents,%EconomicallyDisadvantaged,% EB/ELStudents,OverallRating,OverallScore,StudentAchievementScore,SchoolProgressScore,AcademicGrowthScore,RelativePerformanceScore,Closingthe Gaps Score,DistinctionPostsecondaryReadiness,DistinctionMathematics
1,169,34.3%,1.2%,A,96,94,90,90,88,100,Earned,Earned
2,159,39.6%,0.6%,B,84,86,82,74,82,79,Earned,Not Earned
3,246,45.9%,1.2%,A,91,92,90,82,90,87,Earned,Earned
5,307,34.5%,1.3%,B,89,92,86,73,86,81,Not Earned,Not Earned
6,268,43.7%,1.9%,B,83,86,84,77,84,76,Not Earned,Not Earned
...,...,...,...,...,...,...,...,...,...,...,...,...
10166,407,81.3%,2.0%,C,79,56,81,81,58,74,Not Earned,Not Earned
10168,231,84.0%,1.7%,B,81,57,85,85,59,72,Not Earned,Not Earned
10169,365,87.4%,3.0%,Not Rated: Senate Bill 1365,61,60,66,.,66,50,Not Earned,Not Earned
10171,204,84.8%,5.9%,B,87,82,91,91,87,76,Not Earned,Earned


In [11]:
# Conver to Numeric
columns_to_convert = ['Number ofStudents',
 '%EconomicallyDisadvantaged',
 '% EB/ELStudents',
 'OverallScore',
 'StudentAchievementScore',
 'SchoolProgressScore',
 'AcademicGrowthScore',
 'RelativePerformanceScore',
 'Closingthe Gaps Score']

# Drop commas and percent signs, then convert selected columns to numeric
ratings[columns_to_convert] = ratings[columns_to_convert].replace({',': '', '%': ''}, regex=True).apply(pd.to_numeric, errors='coerce')

In [12]:
#Convert Categorical Data
ratings = pd.get_dummies(ratings, columns=[
 'DistinctionPostsecondaryReadiness',
 'DistinctionMathematics'])

In [13]:
ratings =  ratings.dropna()

In [14]:
ratings['OverallRating'].value_counts()

B                               4199
A                               2516
C                               1590
Not Rated: Senate Bill 1365      508
Not Rated: Data Under Review       1
Name: OverallRating, dtype: int64

In [15]:
# remove one under review school
ratings = ratings[ratings['OverallRating'] != 'Not Rated: Data Under Review']

In [16]:
#check removal
ratings['OverallRating'].value_counts()

B                              4199
A                              2516
C                              1590
Not Rated: Senate Bill 1365     508
Name: OverallRating, dtype: int64

# Compile, Train, and Evaluate Model

In [17]:
# Remove 'DISTRICT ACCOUNTABILITY RATINGS' target from features data
X = ratings.drop(columns=['OverallRating']) 
y = ratings['OverallRating']  

# Split the data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the target variables
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

print(X_train_scaled.shape)

(6609, 13)


In [18]:
# Define the model
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(units=80, activation="relu", input_dim=13))
# Add more hidden layers
model.add(tf.keras.layers.Dense(units=100, activation='relu'))
model.add(tf.keras.layers.Dense(units=50, activation='relu'))
model.add(tf.keras.layers.Dense(units=10, activation='relu'))

# Add the output layer
model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))
# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# Print model summary
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [22]:
model.fit(X_train_scaled, y_train_encoded, epochs=100, batch_size=32, validation_split=0.2)

Epoch 1/100


[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 959us/step - accuracy: 0.7595 - loss: 0.4066 - val_accuracy: 0.7595 - val_loss: 0.4393
Epoch 2/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 782us/step - accuracy: 0.7548 - loss: 0.4184 - val_accuracy: 0.7610 - val_loss: 0.4383
Epoch 3/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 768us/step - accuracy: 0.7604 - loss: 0.4202 - val_accuracy: 0.7602 - val_loss: 0.4385
Epoch 4/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 762us/step - accuracy: 0.7595 - loss: 0.4125 - val_accuracy: 0.7519 - val_loss: 0.4448
Epoch 5/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 746us/step - accuracy: 0.7592 - loss: 0.3975 - val_accuracy: 0.7602 - val_loss: 0.4381
Epoch 6/100
[1m166/166[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 769us/step - accuracy: 0.7692 - loss: 0.3903 - val_accuracy: 0.7610 - val_loss: 0.4384
Epoch 7/100
[1m166/16

<keras.src.callbacks.history.History at 0x1e653ffc370>

In [23]:
# Evaluate the model using the test data
model_loss, model_accuracy = model.evaluate(X_test_scaled,y_test_encoded,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

69/69 - 0s - 596us/step - accuracy: 0.7600 - loss: 0.4125
Loss: 0.41251739859580994, Accuracy: 0.7599818706512451


In [24]:
#save NN
model.save("TEA_Accountability.h5")

