In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

class HyperParameterTuning:
    """
    Pairs a vectorizer with a classifier, both selected for a grid search.

    The classifier is identified by a short string key. The key is used to look
    up the estimator placed in the pipeline and the parameter grid searched for
    it. The lookup tables in this class only define the key 'nb'
    (MultinomialNB); any other key raises KeyError when a lookup is made.
    """

    def __init__(self, classifier, vectorizer):
        """
        Args:
            classifier (str): Key naming the classifier; 'nb' is the only key
                              defined by get_pipeline and get_params
            vectorizer (object): Stored as-is and used as the 'vect' pipeline step
                                 (callers in this file pass a CountVectorizer or
                                 TfidfVectorizer)
        """

        self.vectorizer = vectorizer
        self.classifier = classifier

    def get_pipeline(self):
        """
        Args:
            self
        Returns:
            pipeline (sklearn pipeline object): Two steps, 'vect' (the stored
                                                vectorizer) then 'clf' (a new
                                                estimator for self.classifier)
        Raises:
            KeyError: if self.classifier is not a defined key
        """

        # Map keys to estimator classes; a fresh estimator is built per call.
        estimator_classes = {'nb': MultinomialNB}
        estimator = estimator_classes[self.classifier]()

        steps = [('vect', self.vectorizer), ('clf', estimator)]
        return Pipeline(steps)

    def get_params(self):
        """
        Args:
            self
        Returns:
            (dict): Parameter grid for self.classifier. Keys carry the 'clf__'
                    prefix so they address the 'clf' step built by get_pipeline
        Raises:
            KeyError: if self.classifier is not a defined key
        """
        # NOTE(review): alpha=0 means no additive smoothing for MultinomialNB;
        # confirm that searching over 0 is intended.
        grids = {
            'nb': {
                'clf__alpha': (0, 1),
                'clf__fit_prior': (True, False),
            },
        }
        return grids[self.classifier]


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer, WordNetLemmatizer


class VectorSpace:
    """
    Prepares training text and an unfitted sklearn vectorizer for it.

    Optionally stems or lemmatizes every sentence, then constructs a
    CountVectorizer or TfidfVectorizer with the requested stop words and ngram
    range. The vectorizer is only constructed here, never fitted.
    """

    # Used when no ngram range is supplied. sklearn's vectorizers need a
    # (min_n, max_n) tuple, and (1, 1) is their own default.
    _DEFAULT_NGRAM_RANGE = (1, 1)

    def __init__(self, train, weighting_factor=None, reduction=None,
                 stop_words=None, ngrams=None):

        """
        Args:
            train (object with .apply): the training sentences; a reduction is
                                applied via train.apply(fn) with fn taking one
                                str, so presumably a Pandas Series of str
                                (main passes train['Text'])
            weighting_factor (Optional argument, None by default, str otherwise):
                                'TF' selects CountVectorizer; any other value
                                (including None) selects TfidfVectorizer
            reduction (Optional argument, None by default, str otherwise):
                                can take 'stem' or 'lemmatize'; any other value
                                leaves the text unchanged
            stop_words (Optional argument, None by default, str otherwise):
                                forwarded to the vectorizer, e.g. 'english'
            ngrams (Optional argument, None by default, tuple otherwise):
                                (min_n, max_n) such as (1, 1), (1, 2) or (2, 2);
                                None falls back to (1, 1)
        """
        print("Parameters recieved: ", weighting_factor, reduction, stop_words, ngrams)
        self.train = train
        self.weighting_factor = weighting_factor
        self.stop_words = stop_words
        self.ngrams = ngrams
        self.reduction = reduction

    def _ngram_range(self):
        """
        Args:
            self
        Returns:
            (tuple): self.ngrams, or (1, 1) when no range was given. Passing
                     None through as ngram_range would give the vectorizer an
                     invalid setting
        """
        if self.ngrams is None:
            return self._DEFAULT_NGRAM_RANGE
        return self.ngrams

    def lemmatize_sentences(self, sentence):
        """
        Args:
            sentence (str): A single sentence; it is split on whitespace
        Returns:
            (str): The tokens passed through WordNetLemmatizer.lemmatize and
                   re-joined with single spaces
        """
        lemmatizer = WordNetLemmatizer()
        return ' '.join(lemmatizer.lemmatize(token) for token in sentence.split())

    def stem_sentences(self, sentence):

        """
        Args:
            sentence (str): A single sentence; it is split on whitespace
        Returns:
            (str): The tokens passed through PorterStemmer.stem and re-joined
                   with single spaces
        """

        porter_stemmer = PorterStemmer()
        return ' '.join(porter_stemmer.stem(token) for token in sentence.split())

    def apply_reduction(self):

        """
        Applies the configured reduction to self.train and stores the result
        back on self.train. Each call transforms whatever self.train currently
        holds, so calling it again reduces already-reduced text.

        Args:
            self
        Returns:
            train: self.train after the reduction (unchanged when self.reduction
                   is neither 'stem' nor 'lemmatize')
        """

        if self.reduction == 'stem':
            print("Performing reduction: stemming")
            self.train = self.train.apply(self.stem_sentences)
        elif self.reduction == 'lemmatize':
            print("Performing reduction: lemmatization")
            self.train = self.train.apply(self.lemmatize_sentences)
        return self.train

    def tf_vectorizer(self):

        """
        Args:
            self
        Returns:
            vectorizer (CountVectorizer object): constructed, not fitted
            train: the training data with reduction applied (if any)
        """

        self.train = self.apply_reduction()
        ngram_range = self._ngram_range()
        print("Returning CountVectorizer object with parameters: ", self.stop_words, ngram_range)
        vectorizer = CountVectorizer(stop_words=self.stop_words, ngram_range=ngram_range)
        return vectorizer, self.train

    def tfidf_vectorizer(self):

        """
        Args:
            self
        Returns:
            vectorizer (TfidfVectorizer object): constructed, not fitted
            train: the training data with reduction applied (if any)
        """

        self.train = self.apply_reduction()
        ngram_range = self._ngram_range()
        print("Returning TfidfVectorizer object with parameters: ", self.stop_words, ngram_range)
        vectorizer = TfidfVectorizer(stop_words=self.stop_words, ngram_range=ngram_range)
        return vectorizer, self.train

    def create_vec_space(self):

        """
        Args:
            self
        Returns:
            vectorizer (CountVectorizer object when weighting_factor is 'TF',
                        TfidfVectorizer object otherwise)
            train: the training data with reduction applied (if any)
        """

        if self.weighting_factor == 'TF':
            return self.tf_vectorizer()
        return self.tfidf_vectorizer()


In [None]:
import itertools
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split




def tune_fit_model(transformed_data, parameter_desc, train):
    """
    Grid-searches a classifier pipeline for every prepared dataset and prints
    a summary table of the results. Nothing is written to disk.

    For each dataset a stratified 15% hold-out split is drawn, a 5-fold
    GridSearchCV (accuracy scoring, all cores) is fitted on the remainder, and
    the refitted best model is scored on the hold-out part.

    Args:
        transformed_data (list of lists): One [vectorizer, reduced_data] pair per dataset.
                                          1. vectorizer: becomes the 'vect' step of the
                                             pipeline
                                          2. reduced_data: the training text, with any
                                             stemming/lemmatization already applied
        parameter_desc (list of lists): parameter_desc[i] describes transformed_data[i]; it is
                                        printed and stored in the 'Dataset' column
        train (Pandas DataFrame): must provide the labels in a 'Score' column
    Returns:
         None
    """

    result_columns = ['Dataset', 'Classifier', 'Training accuracy',
                      'Validation accuracy', 'Classifier object']
    rows = []

    for position, (vectorizer, reduced_data) in enumerate(transformed_data):
        for classifier in {'nb'}:
            description = parameter_desc[position]
            print("Data: ", description)
            print("Classfier: ", classifier)

            labels = train['Score']
            x_train, x_test, y_train, y_test = train_test_split(
                reduced_data, labels, stratify=labels, test_size=0.15)

            tuner = HyperParameterTuning(classifier, vectorizer)
            search = GridSearchCV(tuner.get_pipeline(),
                                  param_grid=tuner.get_params(),
                                  cv=5, scoring='accuracy',
                                  n_jobs=-1)
            search.fit(x_train, y_train)

            # Score the refitted best estimator once and reuse the value.
            holdout_accuracy = accuracy_score(y_test, search.predict(x_test))
            print("Validation accuracy", holdout_accuracy)
            print("Best parameter (CV score=%0.3f):" % search.best_score_)
            print(search.best_params_)

            # 'Training accuracy' holds the best cross-validation score and
            # 'Classifier object' holds the best parameter dict.
            row = [description, classifier, search.best_score_,
                   holdout_accuracy, search.best_params_]
            rows.append(row)
            print(row)
            print()

    param_df = pd.DataFrame(rows, columns=result_columns)

    print(param_df)


def main():
    """
    Reads the training CSV named by the module-level data_location, builds one
    vector space per combination of weighting, reduction, stop-word and ngram
    option, and hands all of them to tune_fit_model.

    The CSV must provide a 'Text' column (the sentences); tune_fit_model
    additionally reads a 'Score' column from it.
    """

    train = pd.read_csv(data_location)

    # Cartesian product of every option; the text column is the single,
    # shared first component of each combination.
    combinations = itertools.product(
        [train['Text']],
        ['TF', 'TFIDF'],
        ['stem', 'lemmatize', None],
        ['english', None],
        [(1, 1), (1, 2), (2, 2)],
    )

    transformed_data, parameter_desc = [], []

    for count, (text, weighting, reduction, stop_words, ngrams) in enumerate(combinations,
                                                                             start=1):
        print("Transformed data no. ", count)
        model = VectorSpace(text, weighting, reduction, stop_words, ngrams)
        vectorizer, reduced_data = model.create_vec_space()

        transformed_data.append([vectorizer, reduced_data])
        parameter_desc.append([weighting, reduction, stop_words, ngrams])
        print("Vector space transformation applied with parameters: ",
              weighting, reduction, stop_words, ngrams)
        print()

    tune_fit_model(transformed_data, parameter_desc, train)


if __name__ == '__main__':

    # Path of the training CSV. main() reads this module-level name and passes
    # it to pd.read_csv, so it has to be filled in before running the script.
    # The CSV needs a 'Text' column and a 'Score' column.
    data_location = "" #Enter data location here
    main()
