In [4]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('college').getOrCreate()

In [5]:
from google.colab import files
uploaded = files.upload()

Saving College.csv to College.csv


In [19]:
df = spark.read.csv('College.csv', inferSchema = True, header = True)

In [20]:
df.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)



In [21]:
df.head(1)

[Row(School='Abilene Christian University', Private='Yes', Apps=1660, Accept=1232, Enroll=721, Top10perc=23, Top25perc=52, F_Undergrad=2885, P_Undergrad=537, Outstate=7440, Room_Board=3300, Books=450, Personal=2200, PhD=70, Terminal=78, S_F_Ratio=18.1, perc_alumni=12, Expend=7041, Grad_Rate=60)]

In [22]:
from pyspark.ml.feature import VectorAssembler

In [23]:
df.columns

['School',
 'Private',
 'Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

# **Assess correlations between the features and the target**

In [18]:
from scipy.stats import shapiro

feature_cols = ['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate']

results = {}

for col in feature_cols:

    sampled_data = df.select(col).dropna().sample(fraction = 0.1, seed = 42).toPandas()[col]

    if len(sampled_data) < 3:
        print(f"{col}: data size is too small")
        continue

    stat, p_value = shapiro(sampled_data)
    results[col] = (stat, p_value)

    print(f"{col} - Shapiro-Wilk: {stat:.4f}, p-value: {p_value:.4f}")

    if p_value > 0.05:

        print(f"  => ✅ {col} follows a normal distribution.\n")

    else:

        print(f"  => {col} dows not follow a normal distribution.\n")

Apps - Shapiro-Wilk: 0.7669, p-value: 0.0000
  => Apps dows not follow a normal distribution.

Accept - Shapiro-Wilk: 0.7382, p-value: 0.0000
  => Accept dows not follow a normal distribution.

Enroll - Shapiro-Wilk: 0.7068, p-value: 0.0000
  => Enroll dows not follow a normal distribution.

Top10perc - Shapiro-Wilk: 0.9348, p-value: 0.0004
  => Top10perc dows not follow a normal distribution.

Top25perc - Shapiro-Wilk: 0.9741, p-value: 0.0963
  => ✅ Top25perc follows a normal distribution.

F_Undergrad - Shapiro-Wilk: 0.6959, p-value: 0.0000
  => F_Undergrad dows not follow a normal distribution.

P_Undergrad - Shapiro-Wilk: 0.4545, p-value: 0.0000
  => P_Undergrad dows not follow a normal distribution.

Outstate - Shapiro-Wilk: 0.9461, p-value: 0.0018
  => Outstate dows not follow a normal distribution.

Room_Board - Shapiro-Wilk: 0.9813, p-value: 0.2772
  => ✅ Room_Board follows a normal distribution.

Books - Shapiro-Wilk: 0.6420, p-value: 0.0000
  => Books dows not follow a normal

In [24]:
from scipy.stats import ttest_ind, mannwhitneyu, shapiro

for col in feature_cols:
    pdf = df.select(col, 'Private').dropna().toPandas()

    group_private = pdf[pdf['Private'] == 'Yes'][col]
    group_public = pdf[pdf['Private'] == 'No'][col]

    pval_private = shapiro(group_private)[1]
    pval_public = shapiro(group_public)[1]

    if pval_private > 0.05 and pval_public > 0.05:

        t_stat, p_val = ttest_ind(group_private, group_public, equal_var=False)

        test_name = "t-test"

    else:

        u_stat, p_val = mannwhitneyu(group_private, group_public, alternative='two-sided')

        test_name = "Mann-Whitney"

    print(f"{col}: {test_name} p-value = {p_val:.4g}")

Apps: Mann-Whitney p-value = 1.829e-39
Accept: Mann-Whitney p-value = 1.658e-42
Enroll: Mann-Whitney p-value = 1.196e-56
Top10perc: Mann-Whitney p-value = 2.941e-08
Top25perc: Mann-Whitney p-value = 0.006531
F_Undergrad: Mann-Whitney p-value = 2.026e-66
P_Undergrad: Mann-Whitney p-value = 2.329e-52
Outstate: Mann-Whitney p-value = 8.311e-63
Room_Board: Mann-Whitney p-value = 5.632e-20
Books: Mann-Whitney p-value = 0.001812
Personal: Mann-Whitney p-value = 6.406e-21
PhD: Mann-Whitney p-value = 8.582e-05
Terminal: Mann-Whitney p-value = 0.002743
S_F_Ratio: Mann-Whitney p-value = 3.789e-45
perc_alumni: Mann-Whitney p-value = 1.048e-33
Expend: Mann-Whitney p-value = 2.457e-23
Grad_Rate: Mann-Whitney p-value = 4.384e-23


# **Select predictors**

In [13]:
assembler = VectorAssembler(inputCols = ['Apps',
 'Accept',
 'Enroll',
 'Top10perc',
 'Top25perc',
 'F_Undergrad',
 'P_Undergrad',
 'Outstate',
 'Room_Board',
 'Books',
 'Personal',
 'PhD',
 'Terminal',
 'S_F_Ratio',
 'perc_alumni',
 'Expend',
 'Grad_Rate'], outputCol = 'features')

In [14]:
output = assembler.transform(df)

# **Change the target into numbers**

In [25]:
from pyspark.ml.feature import StringIndexer

In [26]:
indexer = StringIndexer(inputCol = 'Private', outputCol = 'PrivateIndex')

In [27]:
fixed_output = indexer.fit(output).transform(output)

In [28]:
fixed_output.printSchema()

root
 |-- School: string (nullable = true)
 |-- Private: string (nullable = true)
 |-- Apps: integer (nullable = true)
 |-- Accept: integer (nullable = true)
 |-- Enroll: integer (nullable = true)
 |-- Top10perc: integer (nullable = true)
 |-- Top25perc: integer (nullable = true)
 |-- F_Undergrad: integer (nullable = true)
 |-- P_Undergrad: integer (nullable = true)
 |-- Outstate: integer (nullable = true)
 |-- Room_Board: integer (nullable = true)
 |-- Books: integer (nullable = true)
 |-- Personal: integer (nullable = true)
 |-- PhD: integer (nullable = true)
 |-- Terminal: integer (nullable = true)
 |-- S_F_Ratio: double (nullable = true)
 |-- perc_alumni: integer (nullable = true)
 |-- Expend: integer (nullable = true)
 |-- Grad_Rate: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- PrivateIndex: double (nullable = false)



In [29]:
final_data = fixed_output.select('features', 'PrivateIndex')

# **Develop classification models**

In [30]:
train_data, test_data = final_data.randomSplit([0.7, 0.3])

In [33]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

In [34]:
from pyspark.ml import Pipeline

In [37]:
dtc_model = DecisionTreeClassifier(labelCol = 'PrivateIndex', featuresCol = 'features')
rfc_model = RandomForestClassifier(labelCol = 'PrivateIndex', featuresCol = 'features')
gbt_model = GBTClassifier(labelCol = 'PrivateIndex', featuresCol = 'features')

In [39]:
fitted_dtc_model = dtc_model.fit(train_data)
fitted_rfc_model = rfc_model.fit(train_data)
fitted_gbt_model = gbt_model.fit(train_data)

In [40]:
dtc_preds = fitted_dtc_model.transform(test_data)
rfc_preds = fitted_rfc_model.transform(test_data)
gbt_preds = fitted_gbt_model.transform(test_data)

In [41]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [42]:
evaluation = BinaryClassificationEvaluator(labelCol = 'PrivateIndex')

In [45]:
print('Decision Tree Classifier')
evaluation.evaluate(dtc_preds)

Decision Tree Classifier


0.9609826589595375

In [46]:
print('Random Forest Classifier')
evaluation.evaluate(rfc_preds)

Random Forest Classifier


0.9825626204238919

In [48]:
gbt_evaluation = BinaryClassificationEvaluator(labelCol = 'PrivateIndex', rawPredictionCol='prediction')

In [49]:
print('Gradient Boosting Tree Classifier')
print(gbt_evaluation.evaluate(gbt_preds))

Gradient Boosting Tree Classifier
0.9297687861271675
