# Importation of Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import RocCurveDisplay
from sklearn.datasets import load_wine
import torch
import transformers as ppb
from sklearn.metrics import log_loss

## Start time

In [2]:
import time
start=time.time()

## Importation of data

In [3]:
df = pd.read_csv("Book12.csv", encoding='cp1252', header=None)

In [4]:
df.shape

(13816, 2)

In [5]:
df.head()

Unnamed: 0,0,1
0,Pour quelque chose d aussi splendide que ce fi...,0
1,"Ceci est un film etonnant, un tour de force un...",0
2,"Peut etre plus genial qu ingenieux, mais cela ...",0
3,Il y a une liberte de regarder des cascades qu...,0
4,"Si le smoking etait reellement un costume, il ...",0


##### We will take all the data because i don't know how to shuffle it

In [6]:
batch_1 = df[1:13814]

## Loading the Pre-trained BERT model 



In [7]:
# For DistilBERT:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

#for Bert
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')


# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


##### Not enough memory on cuda

In [8]:
#device=torch.device("cuda" if torch.cuda.is_available() else "cpu") 
#print(device)

## Tokenization

In [9]:
#Break sentences into word and subwords for Bert
tokenized = batch_1[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [10]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [11]:
np.array(padded).shape

(13813, 101)

In [12]:
attention_mask = np.where(padded != 0, 1, 0)

##### Not enough memory on cuda


In [13]:
#model.to(device)

In [14]:
input_ids = torch.tensor(padded) 
#input_ids.to(device)

In [15]:
attention_mask = torch.tensor(attention_mask)
#attention_mask.to(device)

#### It took 30 minutes to load the model

In [None]:
with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

#with torch.no_grad():
#    last_hidden_states = model(input_ids.cuda(), attention_mask=attention_mask.cuda())

## Deep-learning
###### The model() function runs our senstences through BERT.
    The results of the processing will be returned into Last_hidden_states

In [17]:
input_ids.shape

torch.Size([13813, 101])

In [18]:
input_ids

tensor([[  101,  8292,  6895,  ...,     0,     0,     0],
        [  101, 21877,  4904,  ...,     0,     0,     0],
        [  101,  6335,  1061,  ...,     0,     0,     0],
        ...,
        [  101,  9145,  1010,  ...,     0,     0,     0],
        [  101,  2202,  2729,  ...,     0,     0,     0],
        [  101,  1996,  5896,  ...,     0,     0,     0]])

In [19]:
features = last_hidden_states[0][:,0,:]
#features = last_hidden_states[0][:,0,:].cpu()
features.shape

torch.Size([13813, 768])

## Model Train/Split

In [20]:
labels = batch_1[1]
labels

1        0
2        0
3        0
4        0
5        0
        ..
13809    1
13810    1
13811    1
13812    1
13813    1
Name: 1, Length: 13813, dtype: int64

In [21]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)
test_features.shape


torch.Size([3454, 768])

In [22]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [23]:
lr_clf.score(test_features, test_labels)

0.9971048060220035

## Test a French sentence

In [24]:
df2 = pd.read_csv('Book13-francais.csv', header=None)
batch_2 = df2

In [25]:
#Break sentences into word and subwords for Bert
tokenized2= df2[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
tokenized2

0    [101, 3802, 15068, 2072, 15333, 24086, 2015, 2...
Name: 0, dtype: object

In [26]:
padded2=tokenized2
attention_mask2 = np.where(padded2 != 0, 1, 0)
attention_mask2

array([1])

In [27]:
labels2=df2[1]
labels2

0    1
Name: 1, dtype: int64

In [28]:
input_ids2 = torch.tensor(tokenized2)  
attention_mask2=torch.tensor(attention_mask2)
with torch.no_grad():
    last_hidden_states2=model(input_ids2)

In [29]:
features2=last_hidden_states2[0][:,0,:].numpy()

lr_clf.decision_function()

In [31]:
def test(X):
    if lr_clf.decision_function(X)>=0:
        return('The sentence may be english')
    else:
        return('The sentence may be french')

In [32]:
test(features2)

'The sentence may be french'

## End Time

In [33]:
end=time.time()
print(end-start)

2147.238273382187
