# AIcrowd Runtime Configuration

In [None]:
import os

# Runtime locations; each one can be overridden through an environment variable.
# Prefer an absolute path for the dataset location. For a path relative to the
# working directory, build it with `os.path.join(os.getcwd(), "test_data", "test.csv")`.
AICROWD_DATASET_PATH = os.getenv("DATASET_PATH", os.path.join(os.getcwd(), "data", "data.csv"))
# Directory that receives submission.csv; "" resolves to the current working directory.
AICROWD_OUTPUTS_PATH = os.getenv("OUTPUTS_DIR", "")
# Directory passed to `aicrowd notebook submit --assets-dir`.
AICROWD_ASSETS_DIR = os.getenv("ASSETS_DIR", "assets")

# Install packages

In [None]:
!pip install --upgrade scikit-learn gensim
!pip install -q -U aicrowd-cli

# Define preprocessing code

In [None]:
from glob import glob
import os
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import sklearn

# Training phase

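No training step is needed for this approach; the cell below only logs in to AIcrowd and downloads the challenge dataset.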

In [None]:
!pip install aicrowd-cli
API_KEY = ""
!aicrowd login --api-key $API_KEY
!mkdir data
!aicrowd dataset download --challenge feature-engineering -j 3 -o data

# Prediction phase

In [None]:
# Build one TF-IDF-based integer feature vector per row of the dataset and
# write the result to submission.csv.
from gensim.parsing.preprocessing import remove_stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Tunable knobs for the feature extraction.
MAX_FEATURES = 512    # vocabulary size == length of each feature vector
NGRAM_RANGE = (1, 3)  # unigrams through trigrams
TFIDF_SCALE = 5       # TF-IDF weights are multiplied by this before rounding to int

test_dataset = pd.read_csv(AICROWD_DATASET_PATH)

# Missing text is treated as an empty document so remove_stopwords always gets a str.
documents = [remove_stopwords(doc) for doc in test_dataset["text"].fillna("").astype(str)]

# Names keep their original "X_train_*" spelling for compatibility; the vectorizer
# is fitted on the dataset loaded above, no separate training set is involved.
count_vect = CountVectorizer(max_features=MAX_FEATURES, ngram_range=NGRAM_RANGE)
X_train_counts = count_vect.fit_transform(documents)

tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf = np.round(X_train_tf.toarray() * TFIDF_SCALE).astype(int)

# Bracket assignment always writes a real column; attribute assignment
# (`test_dataset.feature = ...`) only does so when the column already exists.
test_dataset["feature"] = [str(row) for row in X_train_tf.tolist()]

# Create the output directory when one is configured ("" means the working directory).
if AICROWD_OUTPUTS_PATH:
    os.makedirs(AICROWD_OUTPUTS_PATH, exist_ok=True)
test_dataset.to_csv(os.path.join(AICROWD_OUTPUTS_PATH, "submission.csv"), index=False)

# Submit to AIcrowd

In [None]:
!DATASET_PATH=$AICROWD_DATASET_PATH \
aicrowd -v notebook submit \
    --assets-dir $AICROWD_ASSETS_DIR \
    --challenge nlp-feature-engineering