In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found,
# so the exact dataset paths can be copied into the read_csv calls.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# Student Grade Prediction

In [None]:
import pandas as pd
# Load the raw survey export that every later cell works from.
data = pd.read_csv("../input/iitg-students-grade2/grade_prediction.csv")

In [None]:
# Lift the column display cap so the preview shows every survey column.
pd.set_option("display.max_columns", None)
data.head()

In [None]:
# (rows, columns) of the raw frame before any cleaning.
data.shape

In [None]:
pip install jellyfish

In [None]:
import jellyfish

In [None]:
# Final processing of String variables 
def top_or_not(value, top_values):
    """Keep ``value`` if it is one of ``top_values``, else collapse it.

    Anything outside ``top_values`` is replaced by the Soundex code of the
    literal word "other", so all rare entries share a single bucket.
    """
    return value if value in top_values else jellyfish.soundex("other")

def string_col_preprocess(data, col, n):
    """Normalise a text column and add a Soundex-coded companion column.

    ``data[col]`` is lower-cased and stripped; missing entries and the dash
    placeholders ('-', '--', '---') become "none". A new column named
    ``col + "Sound"`` receives the Soundex code of every entry, after which
    only the ``n`` most frequent codes are kept and every other code is
    replaced by the Soundex code of "other" (see ``top_or_not``).

    The frame is modified in place and also returned.
    """
    sound_col = col + "Sound"
    dash_placeholders = {'-':"none", '--':'none', '---':'none'}

    # Normal string preprocessing
    cleaned = data[col].str.lower().str.strip()
    data[col] = cleaned.fillna("none").replace(dash_placeholders)

    # Phonetic code of each cleaned entry
    data[sound_col] = data[col].apply(lambda text: jellyfish.soundex(text))

    # Keep the n most common codes; everything else goes into the "other" bucket
    code_counts = data[sound_col].value_counts().sort_values(ascending=False)
    top_values = list(code_counts.keys()[:n])
    data[sound_col] = data[sound_col].apply(lambda code: top_or_not(code, top_values))

    return data

In [None]:
# Survey columns that are discarded before modelling
# ("bekaar" is Hindi for "useless").
bekaar_features = [
    'Which Technical Clubs are you part of ?',
    'Which Cultural Clubs are you part of?',
    'Addiction?',
]

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
# Most-frequent-value imputer; in this notebook it is only referenced from a
# commented-out line at the end of preprocess_data.
imp = SimpleImputer(strategy='most_frequent')
# Shared label encoder, used by preprocess_data for the 'Hostel?' column.
le = LabelEncoder()
def preprocess_data(data):
    """Clean the raw survey responses and encode categorical answers as numbers.

    The caller's frame is never modified: every step works on a new frame.
    (The previous version dropped columns from the input in place, so calling
    the function a second time on the same frame raised ``KeyError``.)

    Parameters
    ----------
    data : pandas.DataFrame
        Raw survey export with its original headers. The target column is
        read as ``'CPI '`` (trailing space) before the headers are stripped.

    Returns
    -------
    pandas.DataFrame
        New frame with administrative and discarded columns removed, rows
        with a CPI below 5 or a missing CPI dropped, the mapped columns
        replaced by numeric codes, and every remaining NaN filled with 0.

    Notes
    -----
    Relies on the notebook-level names ``bekaar_features``, ``le`` and
    ``string_col_preprocess``.
    """
    # Administrative columns plus the features the notebook chose to discard
    # (club membership and addiction; their one-hot encodings were tried earlier).
    data = data.drop(['Timestamp', 'Survey taken by(CnA Member)'], axis=1)
    data = data.drop(bekaar_features, axis=1)

    # Removing entries with less than 5 CPI, and entries without a CPI.
    data = data.drop(data[data['CPI '] < 5].index)
    data = data.dropna(axis=0, subset=['CPI '])
    data = data.rename(str.strip, axis='columns')

    # Placeholder / junk answers, replaced wherever they occur in the frame.
    data = data.replace({'-':0, '--':0, '---':0, '<':0, '>':0, 'Null':0, '12:00':2, 'O':0})

    # Coaching City, Coaching Name, Home State, Home City, parents' jobs:
    # free text -> Soundex bucket -> rank of the bucket by mean CPI (1 = highest).
    # NOTE(review): the ranks are computed from the target over all rows, so any
    # later validation split has already seen target information - confirm this
    # is acceptable for how the scores are reported.
    for col in ['Coaching City', 'Coaching Name', 'Home State', 'Home City', 'Mom\'s Job', 'Dad\'s Job']:
        data = string_col_preprocess(data, col, 8)
        data = data.drop(col, axis=1)

        sound_col = col + 'Sound'
        mean_cpi = data.groupby(sound_col)['CPI'].mean().sort_values(ascending=False)
        replace_dict = {key: rank + 1 for rank, key in enumerate(mean_cpi.keys())}
        data[sound_col] = data[sound_col].replace(replace_dict)

    # Defaults for unanswered questions, applied before the codes below.
    data['Mom\'s Education'] = data['Mom\'s Education'].fillna('Post Graduate')
    data['Dad\'s Education'] = data['Dad\'s Education'].fillna('Post Graduate')
    data['Study Time?'] = data['Study Time?'].fillna('Irregular')
    data['Time spent outside your room[except classes]? (daily average, in hours)'] = data['Time spent outside your room[except classes]? (daily average, in hours)'].fillna(4)

    # Hostel names get arbitrary integer labels from the shared encoder.
    data['Hostel?'] = le.fit_transform(data['Hostel?'])

    # Answer -> numeric code, one entry per column. Answers that are not listed
    # are left as they are.
    yes_no = {'Yes':1, 'No':0}
    board_codes = {'ICSE':2, 'CBSE':1, 'State':0}
    education_levels = {'<10th Pass':0,
                        '< 10th Pass':0,
                        '10th Pass':1,
                        '12th Pass':2,
                        'Graduate':3,
                        'Post Graduate':4}
    column_codes = {
        'Sex': {'Male':0, 'Female':1},
        'Branch': {'Design':0,
                   'CSE':1,
                   'MC':2,
                   'ECE/EEE':3,
                   'ME':4,
                   'CL':5,
                   'EP':6,
                   'CE':7,
                   'CST':8,
                   'BSBE':9},
        'Dropper?': yes_no,
        '10th Board': board_codes,
        '12th Board': board_codes,
        'Coaching': yes_no,
        'Mom\'s Education': education_levels,
        'Dad\'s Education': education_levels,
        'Study Time?': {'Irregular':0,
                        'Everyday upto 0-2 hours':1,
                        'Everyday upto 2-4 hours':2},
        'Have you taken an educational loan?': yes_no,
        # NOTE(review): 'Below 90?' -> 87.5 does not look like the midpoint of
        # the 75-90 bucket (82.5), unlike the 62.5 and 95 entries - confirm.
        'Attendance?': {'Below 50?':50,
                        'Below 75?':62.5,
                        'Below 90?':87.5,
                        'Above 90?':95},
        'Relationship status?': {'Committed':0,
                                 'Complicated':0,
                                 'Single':1},
        'Library?': {'Rarely':0,
                     'During Exams':1,
                     'Often':2},
        # Both string and numeric spellings of the same answer are covered.
        'Sleep Duration(Hrs)?': {'<=4':-2,
                                 '5':-1,
                                 5:-1,
                                 '6':0,
                                 6:0,
                                 '7':1,
                                 7:1,
                                 '>=8':2},
        'Do you sleep during the day?': yes_no,
        'Group Study/Individual': {'Group Study':1, 'Individual':0},
        'Study Material Preferred': {'Online content':0, 'Books':1},
        'Core/Non-Core': {'Core':0, 'Non-Core':1},
    }
    for col, mapping in column_codes.items():
        data[col] = data[col].replace(mapping)

    # Fest Feature: comma-separated multi-select -> one indicator column per fest.
    fest_dummy = data['Member of Fests\' organizing team?'].str.get_dummies(sep=', ')
    data = pd.concat([data.drop('Member of Fests\' organizing team?', axis=1), fest_dummy], axis=1)

    # Sleeping time: an ordinal encoding of bedtime was tried and abandoned;
    # the column is dropped instead.
    data = data.drop("When do you sleep?", axis=1)

    # Missing Values: whatever is still NaN becomes 0 (a most-frequent imputer
    # was tried here and disabled).
    return data.fillna(0)

In [None]:
# Run the cleaning / encoding step on the raw survey frame.
new_data = preprocess_data(data)

In [None]:
# Preview the processed frame to check the encodings.
new_data.head()

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target (CPI) from the predictors.
y = new_data['CPI']
X = new_data.drop(['CPI'], axis=1)

# Hold out 25% for validation. The seed is fixed so that the split is identical
# on every Restart & Run All; without it each run evaluates on different rows.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import PowerTransformer
from sklearn.linear_model import Ridge

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Univariate selector (13 best features by F-statistic) and the ridge model.
kbest = SelectKBest(score_func=f_regression, k=13)
rdg = Ridge(alpha=35)
#rf = RandomForestRegressor(min_samples_split=11, min_samples_leaf=12, n_estimators=500, max_depth=6)

# The selector looks at y, so it has to be re-fitted inside every CV fold.
# Selecting features on the full data first and then cross-validating lets the
# held-out folds influence which features are kept and inflates the scores.
# cross_val_score clones the pipeline, so `kbest` and `rdg` themselves stay unfitted here.
scores = cross_val_score(make_pipeline(kbest, rdg), X, y, cv=5)
print(scores)
print(scores.mean())

# Fit the selector once on all rows for inspection in the following cells
# (selected column names) and keep the reduced matrix available as X_new.
kbest.fit(X, y)
X_new = kbest.transform(X)

In [None]:
# Names of the columns kept by the fitted selector.
X.columns[kbest.get_support()]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
# Wrap the cleaning function so it can act as a pipeline stage.
preprocess_transformer = FunctionTransformer(func=preprocess_data)

# Chain: cleaning -> feature selection -> ridge regression.
pipe = Pipeline(
    steps=[
        ("preprocess", preprocess_transformer),
        ("kbest", kbest),
        ("model", rdg),
    ]
)

In [0]:
import pandas as pd
# FIXME(review): this line is not valid Python (`data__.` cannot be an
# assignment target) and the path looks like a placeholder - confirm the
# intended variable name and dataset before running this cell.
data__. = pd.read_csv("../input/data-...csv")