### Import required libraries

In [None]:
# start with common imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# import the packages for machine learning pipelines
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from fast_ml.model_development import train_valid_test_split
from xgboost import XGBClassifier

### Read in the data

In [None]:
# Load the CSV into a DataFrame; the path is relative to the notebook's working directory
data = pd.read_csv(filepath_or_buffer='../data/crime_data.csv')

# Last expression of the cell: displays (number of rows, number of columns)
data.shape

In [None]:
# Display the column labels of the loaded DataFrame
data.columns

### Natural Language Processing
We will apply several preprocessing techniques to the text data to prepare it for the model:
- Tokenization - converting text into tokens
- Removing Stopwords - removing common words that will likely appear in any text
- Lemmatization - converting words to their base form
- Stemming - reducing words to their root form
- n-grams - grouping words together (for example, instead of having "good" and "movie" as separate tokens, we can have "good movie" as one token)

### First we will create custom transformers to perform these tasks

In [None]:
# import required packages
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

# download the required NLTK resources
# - 'punkt' / 'punkt_tab': tokenizer data behind word_tokenize. Newer NLTK releases load
#   'punkt_tab' while older ones load 'punkt', so both are fetched to work on either.
# - 'wordnet': dictionary used by WordNetLemmatizer
for resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# create the stemmer class
class Stemmer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that Porter-stems every word of each text.

    Each text is split with ``word_tokenize``, every token is passed through
    ``PorterStemmer.stem`` and the stems are joined back with single spaces.

    Accepted inputs for ``transform``:
    - a 1-D iterable of texts (list, Series, 1-D array): returns a list of strings;
    - a 2-D DataFrame or array: every cell is stemmed and the same 2-D layout is
      returned (DataFrame in, DataFrame out; otherwise a NumPy array).

    Missing values (``None`` / ``NaN``) become empty strings instead of raising
    inside the tokenizer.
    """

    def __init__(self):
        self.stemmer = PorterStemmer()

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for pipeline chaining."""
        return self

    def _stem_text(self, text):
        """Return ``text`` with every token stemmed, or '' when ``text`` is missing."""
        if pd.isna(text):
            return ''
        return ' '.join(self.stemmer.stem(word) for word in word_tokenize(str(text)))

    def transform(self, X, y=None):
        """Stem every text in ``X`` (see the class docstring for accepted shapes)."""
        if getattr(X, 'ndim', 1) == 2:
            # Iterating a DataFrame yields its column labels (and a 2-D array yields whole
            # rows), so 2-D input has to be processed cell by cell.
            frame = pd.DataFrame(X)
            stemmed = frame.apply(lambda column: column.map(self._stem_text))
            return stemmed if isinstance(X, pd.DataFrame) else stemmed.to_numpy()
        return [self._stem_text(text) for text in X]

# create the lemmatizer class
class Lemmatizer(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that lemmatizes every word of each text.

    Each text is split with ``word_tokenize``, every token is passed through
    ``WordNetLemmatizer.lemmatize`` (called without a part-of-speech tag, so NLTK's
    default applies) and the lemmas are joined back with single spaces.

    Accepted inputs for ``transform``:
    - a 1-D iterable of texts (list, Series, 1-D array): returns a list of strings;
    - a 2-D DataFrame or array: every cell is lemmatized and the same 2-D layout is
      returned (DataFrame in, DataFrame out; otherwise a NumPy array).

    Missing values (``None`` / ``NaN``) become empty strings instead of raising
    inside the tokenizer.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` for pipeline chaining."""
        return self

    def _lemmatize_text(self, text):
        """Return ``text`` with every token lemmatized, or '' when ``text`` is missing."""
        if pd.isna(text):
            return ''
        return ' '.join(self.lemmatizer.lemmatize(word) for word in word_tokenize(str(text)))

    def transform(self, X, y=None):
        """Lemmatize every text in ``X`` (see the class docstring for accepted shapes)."""
        if getattr(X, 'ndim', 1) == 2:
            # Iterating a DataFrame yields its column labels (and a 2-D array yields whole
            # rows), so 2-D input has to be processed cell by cell.
            frame = pd.DataFrame(X)
            lemmatized = frame.apply(lambda column: column.map(self._lemmatize_text))
            return lemmatized if isinstance(X, pd.DataFrame) else lemmatized.to_numpy()
        return [self._lemmatize_text(text) for text in X]

### Split the data into training, validation and test sets

In [None]:

# Split the data
# NOTE: X and y are not consumed by the split call below (it takes the full frame plus the
# name of the target column); they are kept in case a later cell needs the unsplit data.
X = data.drop('target', axis=1)
y = data['target']

# define the train, validation and test size (fractions of the full data set)
train_size = 0.7
valid_size = 0.1
test_size = 0.2

# fixed seed so the same rows land in each split on every Restart & Run All
random_state = 42

X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(
    data,
    target='target',
    train_size=train_size,
    valid_size=valid_size,
    test_size=test_size,
    random_state=random_state,
)

### Encode the target variable

In [None]:
# Encode the target labels as integers.
# The label -> integer mapping is learned from the training labels only and is then
# reused unchanged for the validation and test labels.
le = LabelEncoder()
y_train = le.fit_transform(y_train)

y_valid, y_test = (le.transform(labels) for labels in (y_valid, y_test))

### Categorize features

In [None]:
# Column groups by feature type: free text, numeric and categorical
txt = [
    'text_column1',
    'text_column2',
    'text_column3',
]
num = [
    'numeric_column1',
    'numeric_column2',
]
cat = [
    'categorical_column1',
    'categorical_column2',
]

### Build the Machine Learning Pipeline

### First Step to Pre-Process the Data

In [None]:
from sklearn.base import clone
from sklearn.preprocessing import FunctionTransformer

# Text pipeline (template; a fresh copy is applied to ONE text column at a time)
# Step order matters:
#   1. impute first, so missing text becomes '' before anything tries to tokenize it
#      (SimpleImputer needs 2-D input, hence each column is selected as a one-item list below)
#   2. flatten the (n_samples, 1) block to 1-D, because the stemmer, the lemmatizer and
#      TfidfVectorizer all iterate over a 1-D sequence of documents
# NOTE(review): max_df=10 is an integer, i.e. an absolute document count, so together with
# min_df=5 only terms found in 5 to 10 documents are kept. If a proportion of documents was
# intended, use a float (e.g. 0.9) - confirm.
# NOTE(review): stop_words='english' is applied after stemming, so stemmed tokens may no
# longer match the stop-word list - confirm this is acceptable.
text_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='')),
    ('flatten', FunctionTransformer(np.ravel)),
    ('stemmer', Stemmer()),
    ('lemmatizer', Lemmatizer()),
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=1000, ngram_range=(1, 2), max_df=10, min_df=5)),
    ('scaler', StandardScaler(with_mean=False))
])

# One independent copy of the text pipeline per text column, so every column gets its own
# vocabulary. Entries are named by position to keep the names free of '__', which
# scikit-learn reserves as the nested-parameter separator.
text_transformers = [
    (f'txt_{position}', clone(text_pipeline), [column])
    for position, column in enumerate(txt)
]

# define column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
            ('scaler', StandardScaler())
        ]), num),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='')),
            # ignore categories that were not present when the encoder was fitted (this
            # happens inside cross-validation folds) instead of raising at transform time
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), cat),
        *text_transformers,
    ])

# Define the model
clf = XGBClassifier()

# Create the pipeline: preprocessing followed by the classifier
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', clf)])


### Set up the Parameters for the GridSearchCV and the Model

In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from tqdm import tqdm

# Define the parameter grid
# Keys follow the '<pipeline step>__<parameter>' convention, so both entries
# configure the 'classifier' step of the pipeline.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 4, 5]
}

# Create a list of dictionaries, each containing one combination of parameters
param_list = list(ParameterGrid(param_grid))

# Progress reporting is left to GridSearchCV's own logging (verbose=2).
# A manually created tqdm bar is not used here: nothing in the search would ever
# advance it, so it would only display a counter stuck at 0 until it is closed.

# Define the grid search: 5-fold cross-validation, 8 parallel jobs
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=8, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)