# Classification

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

**Splitting the data into train and test**

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle the data, then hold out 20% of it for testing.
# The fixed random_state makes the split identical on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
)


**Vectorizing the data**
- Input: data (typically a list of texts or sentences)
- Output: a matrix of size (m, n), where m is the number of data instances and n is the number of features (typically the corpus tokens)

In [None]:
# Imported in this cell so that it runs on a fresh kernel: the import cell at the
# top of the notebook only brings in TfidfTransformer and CountVectorizer.
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Use a TF-IDF vectorizer to convert the texts into a vector space.
# Tokens come from nltk.word_tokenize, English stop words are filtered out,
# n-grams of length 1 to 3 are extracted and at most 8000 features are kept.
# NOTE(review): nltk.word_tokenize needs NLTK's tokenizer data to be installed
# (see nltk.download) -- confirm it is available in this environment.
vectorizer = TfidfVectorizer(
    max_features=8000,
    use_idf=True,
    stop_words='english',
    tokenizer=nltk.word_tokenize,
    ngram_range=(1, 3),
)

# Fit the vectorizer on the training data only.
# fit_transform learns the vocabulary and the IDF weights from the training texts
# and returns the TF-IDF matrix of those texts.
X_train_vec = vectorizer.fit_transform(X_train)
# transform reuses the vocabulary and IDF weights learned on the training data
# to vectorize the test texts (nothing is learned from the test set).
X_test_vec = vectorizer.transform(X_test)


**Printing out the features**

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since scikit-learn 1.0) is its replacement and returns an array of
# the feature names, one per column of the vectorized matrices.
features = vectorizer.get_feature_names_out()
print(features)

**Training and testing a classifier**

In [None]:
from sklearn.linear_model import Perceptron

# Create an object of the class *Perceptron*
clf = Perceptron()

# Learn/train the model
# The model is trained on (input, output) pairs
# Input:  X_train_vec (the vectorized input texts)
# Output: Y_train (the labels)
# The labels produced by the train/test split are named Y_train (capital Y);
# the lowercase y_train is not defined by that split.
clf.fit( X_train_vec, Y_train )

# Predict the labels of the test instances
y_pred = clf.predict( X_test_vec )

# Print the gold and predicted labels
# the gold labels come from the dataset (these are the classes associated with each input)
print( "y true:", Y_test )
# the predicted labels are produced by the classifier
print( "y pred:", y_pred )

**Evaluating a classifier**

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Print the accuracy (fraction of test instances whose predicted label is correct)
print( "Acc:", accuracy_score(Y_test, y_pred ) )

# Print the classification report.
# The report is a multi-line table, so it is printed on its own lines (sep='\n');
# printing it right after the label would shift its header row out of alignment.
print( 'Classification report:', classification_report(Y_test, y_pred ), sep='\n' )

# Print the confusion matrix, also starting on its own line so that its rows line up
print( 'Confusion matrix:', confusion_matrix(Y_test, y_pred ), sep='\n' )


**Examining the model**

In [None]:
# Keep the vocabulary learned by the vectorizer in a variable
vocab = vectorizer.vocabulary_
print( "Vocabulary size:", len(vocab) )

# Inverse of the vocabulary: maps each value of vocab back to its key
ix_to_tokens = dict( zip( vocab.values(), vocab.keys() ) )

# Map each position in the first row of the coefficient matrix to its weight
features_weights = dict( enumerate( clf.coef_[0] ) )

# Sort the weights and print them
# NOTE(review): sort_dict is not defined in this section; the code below unpacks
# its result as (weight, index) pairs, presumably in ascending weight order -- confirm.
sorted_weights = sort_dict(features_weights)
print( sorted_weights )

# Inverse of the label dictionary
# tag_to_ix = {class_name: class_idx}, e.g. {drama: 1, comedy: 0}
ix_to_tag = dict( zip( tag_to_ix.values(), tag_to_ix.keys() ) )


def _format_weighted_tokens(pairs):
    """Render (weight, index) pairs as one 'token:weight' line per pair."""
    return '\n'.join( f"{ix_to_tokens[i]}:{w!s}" for (w, i) in pairs )


# Look at the best features for each class:
# the last six entries of sorted_weights (in reverse order) for class 1 ...
print( '\nBest features for identifying class 1, ie', ix_to_tag[1])
print( _format_weighted_tokens( reversed( sorted_weights[-6:] ) ) )

# ... and the first six entries for class 0
print( '\nBest features for identifying class 0, ie', ix_to_tag[0])
print( _format_weighted_tokens( sorted_weights[:6] ) )

**Feature selection**

The Chi-square test is used in statistics to test the independence of two events. In feature selection, we use it to test whether the occurrence of a specific term and the occurrence of a specific class are independent.

In [None]:
# Load libraries
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Number of features to keep. This can be any positive integer; it is capped
# below at the number of available columns so the selector never asks for more
# features than exist.
N_BEST_FEATURES = 1000

# The k features with the highest chi-squared statistics are selected.
# chi2 needs a non-negative numeric matrix, so the selector is applied to the
# vectorized training data (not to the raw texts) and fitted on the training
# split only. The result is stored under a new name so that no earlier variable
# is overwritten and the cell can be re-run safely.
chi2_features = SelectKBest(chi2, k=min(N_BEST_FEATURES, X_train_vec.shape[1]))
X_train_kbest = chi2_features.fit_transform(X_train_vec, Y_train)

In [None]:
# Feature selection: keep the 5000 features with the highest chi-squared scores
# (or all of them if the vectorized matrix has fewer than 5000 columns).
sel = SelectKBest(chi2, k=min(5000, X_train_vec.shape[1]))
# chi2 needs numeric, non-negative input, so the selector is fitted on the
# vectorized training matrix together with the training labels (Y_train).
sel.fit(X_train_vec, Y_train)
# Apply the same selection to both splits. The results get their own names so
# that X_train / X_test are left untouched and the cell can be re-run.
X_train_sel = sel.transform(X_train_vec)
X_test_sel = sel.transform(X_test_vec)

**SKLearn pipeline object**

In [None]:
from sklearn.metrics import accuracy_score

# Chain all processing steps into a single estimator: each step is fitted on the
# training data and its output is passed on to the next step.
pipeline = Pipeline([
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

# The pipeline does its own vectorization, so it is trained directly on the
# training split (X_train, Y_train) produced by train_test_split.
classifier = pipeline.fit(X_train, Y_train)
predictions = classifier.predict(X_test)

# Accuracy: fraction of test instances whose predicted label equals the gold label
print(accuracy_score(Y_test, predictions))