# Text classification with Sklearn

https://reintech.io/blog/how-to-create-a-text-classification-model-with-scikit-learn

In [125]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import _stop_words
from nltk.stem import SnowballStemmer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn import feature_extraction, pipeline, linear_model, metrics

from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

import pandas as pd
import numpy as np

from geotext import GeoText

## Test NLP ( MoltinomailNB ) News Group

In [8]:
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42)
print("Number of documents: ", len(newsgroups.data))
print("Number of categories: ", len(newsgroups.target_names))

Number of documents:  18846
Number of categories:  20


In [9]:
stemmer = SnowballStemmer('english')
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(stop_words='english', analyzer=stemmed_words)
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [17]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 84.32%


In [18]:
y_pred

array([ 9, 12, 14, ...,  0, 15, 14])

## NLP ( MoltinomailNB )

In [77]:
df = pd.read_csv("Sentences.csv")

In [79]:
stemmer = SnowballStemmer('french')
analyzer = CountVectorizer().build_analyzer()

def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

vectorizer = CountVectorizer(stop_words='french', analyzer=stemmed_words)
X = vectorizer.fit_transform(df["sentence"])
y = df.iloc[:, 1] + ' to ' + df.iloc[:, 2]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [80]:
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 1.75%


In [81]:
df_pred = df[-y_pred.size:]
df_pred.loc[df_pred.index, "prediction"] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [82]:
df_pred

Unnamed: 0,sentence,departure,arrival,prediction
1600,Je veux prendre le train de Clermont-Ferrand à...,Clermont-Ferrand,Saint-Denis,Nantes to Saint-Denis
1601,Le train de Paris à Rouen offre une vue magnif...,Paris,Rouen,Limoges to Le Mans
1602,Je prévois de voyager de Rouen à Marseille en ...,Rouen,Marseille,Marseille to Saint-Étienne
1603,"Pour mes prochaines vacances, je vais de Bézie...",Béziers,Tourcoing,Le Havre to Le Mans
1604,Je souhaite découvrir Calais en prenant le tra...,Nîmes,Calais,Tourcoing to Nîmes
...,...,...,...,...
1995,Je veux prendre le train de Bordeaux à La Roch...,Bordeaux,La Rochelle,Bordeaux to Limoges
1996,Le train est le meilleur moyen de se rendre de...,Nice,Lille,Tourcoing to Nîmes
1997,"Allons de Pau à Toulon en train, qu'en penses-...",Pau,Toulon,Calais to Tourcoing
1998,Je souhaite découvrir Marseille en prenant le ...,Orléans,Marseille,Toulon to Lille


## NLP ( SVC & RandomForest )

In [112]:
df = pd.read_csv("Sentences.csv")

# Create lists to store sentences, departure cities, and arrival cities
all_sentences = df["sentence"]
all_departure_cities = df["departure"]
all_arrival_cities = df["arrival"]

### With detection of cities with geotext

In [126]:
# Define a function to extract cities from a sentence
def extract_cities_from_sentence(sentence):
    places = GeoText(sentence)
    cities = list(places.cities)
    if len(cities) >= 2:
        return cities[0], cities[1]
    elif len(cities) == 1:
        return cities[0], None
    else:
        return None, None

# Apply the extract_cities_from_sentence function to each row in the DataFrame
df[["city_1", "city_2"]] = df["sentence"].apply(lambda x: pd.Series(extract_cities_from_sentence(x)))

In [159]:
# Data preprocessing: Tokenization, lowercasing, and punctuation removal
all_sentences = [f"{sentence}, {city1}, {city2}" for sentence, city1, city2 in df[["sentence", "city_1", "city_2"]].values]
all_sentences = [sentence.lower().replace(r'[^\w\s]', '') for sentence in all_sentences]

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(all_sentences)

# Label encoding for departure and arrival cities
y_departure_encoded = all_departure_cities
y_arrival_encoded = all_arrival_cities

# Split the data into training and testing sets
X_train, X_test, y_departure_train, y_departure_test = train_test_split(
    X, y_departure_encoded, test_size=0.2, random_state=42)
X_train, X_test, y_arrival_train, y_arrival_test = train_test_split(
    X, y_arrival_encoded, test_size=0.2, random_state=42)

# Train a Support Vector Machine (SVM) classifier for departure cities
departure_classifier = SVC(C=0.1, kernel='linear')
departure_classifier.fit(X_train, y_departure_train)

# Train a Random Forest classifier for arrival cities
arrival_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
arrival_classifier.fit(X_train, y_arrival_train)

# Make predictions on the test data
y_departure_pred = departure_classifier.predict(X_test)
y_arrival_pred = arrival_classifier.predict(X_test)

# Calculate accuracy
departure_accuracy = accuracy_score(y_departure_test, y_departure_pred)
arrival_accuracy = accuracy_score(y_arrival_test, y_arrival_pred)

# Print the accuracy of the models
print(f"Departure city prediction accuracy: {departure_accuracy * 100:.2f}%")
print(f"Arrival city prediction accuracy: {arrival_accuracy * 100:.2f}%")

Departure city prediction accuracy: 41.50%
Arrival city prediction accuracy: 50.75%


### Without detection of cities

In [118]:
# Data preprocessing: Tokenization, lowercasing, and punctuation removal
all_sentences = [sentence.lower().replace(r'[^\w\s]', '') for sentence in all_sentences]

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(all_sentences)

# Label encoding for departure and arrival cities
y_departure_encoded = all_departure_cities
y_arrival_encoded = all_arrival_cities

# Split the data into training and testing sets
X_train, X_test, y_departure_train, y_departure_test = train_test_split(
    X, y_departure_encoded, test_size=0.2, random_state=42)
X_train, X_test, y_arrival_train, y_arrival_test = train_test_split(
    X, y_arrival_encoded, test_size=0.2, random_state=42)

# Train a Support Vector Machine (SVM) classifier for departure cities
departure_classifier = SVC(kernel='linear')
departure_classifier.fit(X_train, y_departure_train)

# Train a Random Forest classifier for arrival cities
arrival_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
arrival_classifier.fit(X_train, y_arrival_train)

# Make predictions on the test data
y_departure_pred = departure_classifier.predict(X_test)
y_arrival_pred = arrival_classifier.predict(X_test)

# Calculate accuracy
departure_accuracy = accuracy_score(y_departure_test, y_departure_pred)
arrival_accuracy = accuracy_score(y_arrival_test, y_arrival_pred)

# Print the accuracy of the models
print(f"Departure city prediction accuracy: {departure_accuracy * 100:.2f}%")
print(f"Arrival city prediction accuracy: {arrival_accuracy * 100:.2f}%")

Departure city prediction accuracy: 50.25%
Arrival city prediction accuracy: 45.50%


In [91]:
df_pred = df[-y_departure_pred.size:]
df_pred.loc[df_pred.index, "prediction_departure"] = y_departure_pred
df_pred.loc[df_pred.index, "prediction_arrival"] = y_arrival_pred

df_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,sentence,departure,arrival,prediction_departure,prediction_arrival
1600,Je veux prendre le train de Clermont-Ferrand à...,Clermont-Ferrand,Saint-Denis,Saint-Denis,Saint-Denis
1601,Le train de Paris à Rouen offre une vue magnif...,Paris,Rouen,Béziers,Le Mans
1602,Je prévois de voyager de Rouen à Marseille en ...,Rouen,Marseille,Saint-Étienne,Saint-Étienne
1603,"Pour mes prochaines vacances, je vais de Bézie...",Béziers,Tourcoing,Le Mans,Rouen
1604,Je souhaite découvrir Calais en prenant le tra...,Nîmes,Calais,Nîmes,Lille
...,...,...,...,...,...
1995,Je veux prendre le train de Bordeaux à La Roch...,Bordeaux,La Rochelle,Bordeaux,Limoges
1996,Le train est le meilleur moyen de se rendre de...,Nice,Lille,Nîmes,Nîmes
1997,"Allons de Pau à Toulon en train, qu'en penses-...",Pau,Toulon,Toulon,Calais
1998,Je souhaite découvrir Marseille en prenant le ...,Orléans,Marseille,Toulon,Béziers


In [109]:
# Load the dataset
df = pd.read_csv("Sentences.csv")

# Separate the data into input sentences and departure/arrival cities
all_sentences = df["sentence"]
all_departure_cities = df["departure"]
all_arrival_cities = df["arrival"]

# Data preprocessing: Tokenization, lowercasing, and punctuation removal
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove punctuation
    return text

all_sentences = [preprocess_text(sentence) for sentence in all_sentences]

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(all_sentences)

# Label encoding for departure and arrival cities
y_departure_encoded = all_departure_cities
y_arrival_encoded = all_arrival_cities

# Split the data into training and testing sets
X_train, X_test, y_departure_train, y_departure_test = train_test_split(
    X, y_departure_encoded, test_size=0.2, random_state=42)
X_train, X_test, y_arrival_train, y_arrival_test = train_test_split(
    X, y_arrival_encoded, test_size=0.2, random_state=42)

# Train a Support Vector Machine (SVM) classifier for departure cities
departure_classifier = SVC(C=0.1, kernel='linear')
departure_classifier.fit(X_train, y_departure_train)

# Train a Random Forest classifier for arrival cities
arrival_classifier = RandomForestClassifier(n_estimators=10, random_state=42)
arrival_classifier.fit(X_train, y_arrival_train)

# Make predictions on the test data
y_departure_pred = departure_classifier.predict(X_test)
y_arrival_pred = arrival_classifier.predict(X_test)

# Calculate accuracy
departure_accuracy = accuracy_score(y_departure_test, y_departure_pred)
arrival_accuracy = accuracy_score(y_arrival_test, y_arrival_pred)

# Print the accuracy of the models
print(f"Departure city prediction accuracy: {departure_accuracy * 100:.2f}%")
print(f"Arrival city prediction accuracy: {arrival_accuracy * 100:.2f}%")

Departure city prediction accuracy: 52.25%
Arrival city prediction accuracy: 44.25%


In [110]:
df_pred = df[-y_departure_pred.size:]
df_pred.loc[df_pred.index, "prediction_departure"] = y_departure_pred
df_pred.loc[df_pred.index, "prediction_arrival"] = y_arrival_pred

df_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


Unnamed: 0,sentence,departure,arrival,prediction_departure,prediction_arrival
1600,Je veux prendre le train de Clermont-Ferrand à...,Clermont-Ferrand,Saint-Denis,Saint-Denis,Saint-Denis
1601,Le train de Paris à Rouen offre une vue magnif...,Paris,Rouen,Béziers,Le Mans
1602,Je prévois de voyager de Rouen à Marseille en ...,Rouen,Marseille,Saint-Étienne,Lyon
1603,"Pour mes prochaines vacances, je vais de Bézie...",Béziers,Tourcoing,Le Mans,Paris
1604,Je souhaite découvrir Calais en prenant le tra...,Nîmes,Calais,Nîmes,Lille
...,...,...,...,...,...
1995,Je veux prendre le train de Bordeaux à La Roch...,Bordeaux,La Rochelle,Limoges,Limoges
1996,Le train est le meilleur moyen de se rendre de...,Nice,Lille,Paris,Nîmes
1997,"Allons de Pau à Toulon en train, qu'en penses-...",Pau,Toulon,Calais,Calais
1998,Je souhaite découvrir Marseille en prenant le ...,Orléans,Marseille,Toulon,Béziers


### Find HyperParams

In [96]:
# Load the dataset
df = pd.read_csv("Sentences.csv")

# Separate the data into input sentences and departure/arrival cities
all_sentences = df["sentence"]
all_departure_cities = df["departure"]
all_arrival_cities = df["arrival"]

# Data preprocessing: Tokenization, lowercasing, and punctuation removal
def preprocess_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])  # Remove punctuation
    return text

all_sentences = [preprocess_text(sentence) for sentence in all_sentences]

# Feature extraction using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X = tfidf_vectorizer.fit_transform(all_sentences)

# Label encoding for departure and arrival cities
y_departure_encoded = all_departure_cities
y_arrival_encoded = all_arrival_cities

# Split the data into training and testing sets
X_train, X_test, y_departure_train, y_departure_test = train_test_split(
    X, y_departure_encoded, test_size=0.2, random_state=42)
X_train, X_test, y_arrival_train, y_arrival_test = train_test_split(
    X, y_arrival_encoded, test_size=0.2, random_state=42)

# Hyperparameter tuning for Support Vector Machine (SVM)
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1],
}

svm_grid_search = GridSearchCV(SVC(), param_grid_svm, cv=5, n_jobs=-1)
svm_grid_search.fit(X_train, y_departure_train)

best_svm = svm_grid_search.best_estimator_

# Hyperparameter tuning for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
}

rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=5, n_jobs=-1)
rf_grid_search.fit(X_train, y_arrival_train)

best_rf = rf_grid_search.best_estimator_

# Make predictions on the test data
y_departure_pred = best_svm.predict(X_test)
y_arrival_pred = best_rf.predict(X_test)

# Calculate accuracy
departure_accuracy = accuracy_score(y_departure_test, y_departure_pred)
arrival_accuracy = accuracy_score(y_arrival_test, y_arrival_pred)

# Print the best hyperparameters and accuracy
print("Best SVM hyperparameters:", best_svm)
print("Best Random Forest hyperparameters:", best_rf)
print(f"Departure city prediction accuracy: {departure_accuracy * 100:.2f}%")
print(f"Arrival city prediction accuracy: {arrival_accuracy * 100:.2f}%")

Best SVM hyperparameters: SVC(C=0.1, kernel='linear')
Best Random Forest hyperparameters: RandomForestClassifier(max_depth=10, random_state=42)
Departure city prediction accuracy: 52.25%
Arrival city prediction accuracy: 51.50%
