In [1]:
import pandas as pd

df = pd.read_csv('exams.csv')
df.columns = [c.replace(' ','_') for c in df.columns]
originalFeatures = df.columns
print('originalFeatures count', len(originalFeatures))
print('originalFeatures', originalFeatures)
print(df.head())

originalFeatures count 8
originalFeatures Index(['gender', 'race/ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')
   gender race/ethnicity parental_level_of_education         lunch  \
0    male        group A                 high school      standard   
1  female        group D            some high school  free/reduced   
2    male        group E                some college  free/reduced   
3    male        group B                 high school      standard   
4    male        group E          associate's degree      standard   

  test_preparation_course  math_score  reading_score  writing_score  
0               completed          67             67             63  
1                    none          40             59             55  
2                    none          59             60             50  
3                    none          77             78             68  


In [2]:
df.isnull().sum()

gender                         0
race/ethnicity                 0
parental_level_of_education    0
lunch                          0
test_preparation_course        0
math_score                     0
reading_score                  0
writing_score                  0
dtype: int64

In [3]:
df.describe()

Unnamed: 0,math_score,reading_score,writing_score
count,1000.0,1000.0,1000.0
mean,66.396,69.002,67.738
std,15.402871,14.737272,15.600985
min,13.0,27.0,23.0
25%,56.0,60.0,58.0
50%,66.5,70.0,68.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [4]:
df.test_preparation_course.value_counts()

none         665
completed    335
Name: test_preparation_course, dtype: int64

In [5]:
335/665 #the data is slightly imbalanced but it's ok.

0.5037593984962406

In [6]:
X = df[['reading_score','math_score']]
y = df.test_preparation_course

In [7]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled[:3]

array([[-0.13591401,  0.03923309],
       [-0.67902695, -1.714564  ],
       [-0.61113783, -0.48041049]])

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, stratify=y, random_state=10)

In [9]:
X_train.shape

(750, 2)

In [10]:
X_test.shape

(250, 2)

In [11]:
y_train.value_counts()

none         499
completed    251
Name: test_preparation_course, dtype: int64

In [12]:
251/499

0.503006012024048

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

scores = cross_val_score(DecisionTreeClassifier(), X, y, cv=10)
scores

array([0.61, 0.64, 0.51, 0.55, 0.6 , 0.53, 0.53, 0.58, 0.49, 0.6 ])

In [14]:
scores.mean()

0.564

In [15]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(base_estimator = DecisionTreeClassifier(),
                 n_estimators = 100,
                  max_samples=0.8,
                  oob_score=True,
                  random_state=0
                 )
bag_model.fit(X_train, y_train)
bag_model.oob_score_

0.5666666666666667

In [16]:
bag_model.score(X_test, y_test)

0.616

In [17]:
bag_model = BaggingClassifier(base_estimator = DecisionTreeClassifier(),
                 n_estimators = 100,
                  max_samples=0.8,
                  oob_score=True,
                  random_state=0
                 )

cross_val_score(bag_model, X, y, cv=10)
scores.mean()

0.564

In [18]:
from sklearn.ensemble import RandomForestClassifier

scores = cross_val_score(RandomForestClassifier(), X, y, cv = 10)
scores.mean()

0.609

In [19]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
weak_learner = DecisionTreeClassifier(max_depth = 3)
clf = AdaBoostClassifier(base_estimator=weak_learner, n_estimators=100)
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: ", accuracy)

Accuracy:  0.6


In [20]:
from apyori import apriori
min_support = 0.5

min_math = min(df['math_score'])
max_math = max(df['math_score'])
min_read = min(df['reading_score'])
max_read = max(df['reading_score'])
min_write = min(df['writing_score'])
max_write = max(df['writing_score'])

math_normalized = [(x - min_math) / (max_math - min_math) for x in df['math_score']]
read_normalized = [(x - min_read) / (max_read - min_read) for x in df['reading_score']]
write_normalized = [(x - min_write) / (max_write - min_write) for x in df['writing_score']]

df['math_score'] = math_normalized
df['reading_score'] = read_normalized
df['writing_score'] = write_normalized

print(df.columns)
df = pd.get_dummies(df, columns=['gender','race/ethnicity','parental_level_of_education','lunch','test_preparation_course'])
print(df)

data = df.values.tolist()

itemsets = apriori(data, min_support=min_support, min_confidence=0.7, max_length=3)

for itemset in itemsets:
    print(itemset)

print("*********************")

for itemset in itemsets:
    for rule in itemset.ordered_statistics:
        antecedent = rule.items_base
        consequent = rule.items_add
        confidence = rule.confidence
        print(f"{antecedent} -> {consequent} (conf: {confidence:.3f})")

Index(['gender', 'race/ethnicity', 'parental_level_of_education', 'lunch',
       'test_preparation_course', 'math_score', 'reading_score',
       'writing_score'],
      dtype='object')
     math_score  reading_score  writing_score  gender_female  gender_male  \
0      0.620690       0.547945       0.519481              0            1   
1      0.310345       0.438356       0.415584              1            0   
2      0.528736       0.452055       0.350649              0            1   
3      0.735632       0.698630       0.584416              0            1   
4      0.747126       0.630137       0.584416              0            1   
..          ...            ...            ...            ...          ...   
995    0.689655       0.589041       0.545455              0            1   
996    0.827586       0.876712       0.896104              0            1   
997    0.218391       0.109589       0.233766              1            0   
998    0.689655       0.643836       0.7662