In [None]:
# Connexion au drive pour avoir accès au dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Modèles de recommandation

## Modèle 1 : TF-IDF

In [None]:
import pandas as pd

# Création d'un dataframe à partir de notre dataset
df = pd.read_csv("/content/drive/Shareddrives/Chatbot/mpst_full_data.csv")
df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF vectorizer that ignores scikit-learn's built-in English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on every plot synopsis and keep the resulting TF-IDF matrix
# (one row per synopsis); recommend_movies reuses both the vectorizer and the matrix.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['plot_synopsis'])

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity between all TF-IDF rows.
# linear_kernel computes the plain dot product between every pair of rows. TfidfVectorizer
# L2-normalises its rows by default (norm='l2'), so here the dot product equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
def recommend_movies(input, cosine_sim=cosine_sim):
  """Recommend up to 10 film titles for a chatbot message (TF-IDF model).

  Two cases are handled:

  * the message is exactly the title of a film of the dataset (case-insensitive):
    the films whose synopsis is closest to that film's synopsis are returned,
    the film itself being excluded;
  * otherwise the message is treated as free text, vectorized with the fitted
    TF-IDF vectorizer and compared with every synopsis.

  Args:
    input: Raw user message (the name shadows the builtin but is kept for
      backward compatibility with existing callers).
    cosine_sim: Square film-to-film similarity matrix, indexed by row position
      in ``df``.

  Returns:
    A pandas Series of titles (indexed like ``df``), most similar first. For a
    free-text message, films with a zero score are dropped, so fewer than 10
    titles (possibly none) can be returned.
  """
  # Local import: numpy has not been imported yet at this point of the notebook.
  import numpy as np

  query = input.lower()
  # Lower-case the title column once and reuse the boolean mask.
  is_match = (df["title"].str.lower() == query).to_numpy()

  if is_match.any():
    # Row *position* of the first film with that title (safe even if df has a non-default index).
    idx = int(is_match.argmax())
    sim_scores = np.asarray(cosine_sim[idx])
    ranked = sim_scores.argsort()[::-1]
    # Exclude the queried film explicitly: with duplicated synopses it is not
    # guaranteed to be ranked first, so slicing [1:11] could drop the wrong row.
    movies_ind = ranked[ranked != idx][:10]
  else:
    vector = tfidf_vectorizer.transform([query])
    sim_scores = linear_kernel(vector, tfidf_matrix).flatten()
    ranked = sim_scores.argsort()[::-1][:10]
    # A zero score means no shared vocabulary: do not present such films as recommendations.
    movies_ind = ranked[sim_scores[ranked] > 0]

  return df["title"].iloc[movies_ind]

In [None]:
# Test de la fonction précédente avec comme input "the dark knight"
user_input = "the dark knight"
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

2507     Batman: The Dark Knight Returns, Part 2
3211     Batman: The Dark Knight Returns, Part 1
4951                    Batman: The Killing Joke
7990                      Batman: Arkham Origins
1309                                      Batman
13045                      Batman: Arkham Asylum
12336                      Batman: Arkham Knight
2427                         Batman: Arkham City
49            Batman Beyond: Return of the Joker
2669                  Batman: Under the Red Hood
Name: title, dtype: object


In [None]:
# Test de la fonction précédente avec comme input "A young wizard learns about magic"
user_input = "A young wizard learns about magic"
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

12247    Princess Gwenevere and the Jewel Riders
10106                              Strait Jacket
7349                       The Flight of Dragons
7742      The Care Bears Adventure in Wonderland
2510                                 August Rush
8172                        Legend of the Seeker
7827                  The Wonderful Wizard of Oz
10219                                      Troll
11455                                   Maburaho
14623                                    The Wiz
Name: title, dtype: object


In [None]:
# Test de la fonction précédente avec comme input "I like movies about psychopath."
user_input = "I like movies about psychopath."
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

13580           Cold Heart
3152      Don't Let Him In
589           Stage Fright
8615           Blood Dolls
8173     Dillinger è morto
13987       Leonard Part 6
7364            Blue Movie
11570              Athidhi
12400     The Road Killers
3090     Gas, Food Lodging
Name: title, dtype: object


## Modèle 2 : Bag of Words

In [None]:
import pandas as pd

# Création d'un dataframe à partir de notre dataset
df = pd.read_csv("/content/drive/Shareddrives/Chatbot/mpst_full_data.csv")
df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words (raw term count) vectorizer that ignores scikit-learn's built-in English stop words.
count_vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer on every plot synopsis and keep the resulting count matrix
# (one row per synopsis); recommend_movies reuses both the vectorizer and the matrix.
count_matrix = count_vectorizer.fit_transform(df['plot_synopsis'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Pairwise cosine similarity between all bag-of-words rows.
# Raw count vectors are not length-normalised, so a plain dot product (linear_kernel)
# is not a cosine and favours long synopses; cosine_similarity normalises each row first.
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [None]:
def recommend_movies(input, cosine_sim=cosine_sim):
  """Recommend up to 10 film titles for a chatbot message (bag-of-words model).

  Two cases are handled:

  * the message is exactly the title of a film of the dataset (case-insensitive):
    the films whose synopsis is closest to that film's synopsis are returned,
    the film itself being excluded;
  * otherwise the message is treated as free text, vectorized with the fitted
    CountVectorizer and compared with every synopsis.

  Count vectors are not length-normalised, so both branches rank by *cosine*
  similarity rather than by raw dot product (which favours long synopses).

  Args:
    input: Raw user message (the name shadows the builtin but is kept for
      backward compatibility with existing callers).
    cosine_sim: Square film-to-film matrix indexed by row position in ``df``.
      It may hold cosine similarities or raw dot products: each row is rescaled
      by the matrix diagonal, which is a no-op when the diagonal is already 1.

  Returns:
    A pandas Series of titles (indexed like ``df``), most similar first. For a
    free-text message, films with a zero score are dropped, so fewer than 10
    titles (possibly none) can be returned.
  """
  # Local imports: neither name is guaranteed to be in scope at this point of the notebook.
  import numpy as np
  from sklearn.metrics.pairwise import cosine_similarity

  query = input.lower()
  # Lower-case the title column once and reuse the boolean mask.
  is_match = (df["title"].str.lower() == query).to_numpy()

  if is_match.any():
    # Row *position* of the first film with that title.
    idx = int(is_match.argmax())
    gram = np.asarray(cosine_sim)
    # sqrt of the diagonal = vector norms when `gram` holds dot products (1.0 when it holds cosines).
    norms = np.sqrt(np.diag(gram))
    denom = norms[idx] * norms
    # Films with an empty vector (norm 0) get a similarity of 0 instead of a division by zero.
    sim_scores = np.divide(gram[idx], denom, out=np.zeros(len(denom)), where=denom > 0)
    ranked = sim_scores.argsort()[::-1]
    # Exclude the queried film explicitly: with duplicated synopses it is not
    # guaranteed to be ranked first, so slicing [1:11] could drop the wrong row.
    movies_ind = ranked[ranked != idx][:10]
  else:
    vector = count_vectorizer.transform([query])
    sim_scores = cosine_similarity(vector, count_matrix).flatten()
    ranked = sim_scores.argsort()[::-1][:10]
    # A zero score means no shared vocabulary: do not present such films as recommendations.
    movies_ind = ranked[sim_scores[ranked] > 0]

  return df["title"].iloc[movies_ind]

In [None]:
user_input = "the dark knight"
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

4794                  The Dark Knight Rises
329                           Batman Begins
49       Batman Beyond: Return of the Joker
5753                        Men in Black II
5754                Was tun, wenn's brennt?
1309                                 Batman
12336                 Batman: Arkham Knight
5872                           Men in Black
4211                        American Psycho
1287                        Poltergeist III
Name: title, dtype: object


In [None]:
user_input = "A young wizard learns about magic"
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

11458                           Un chien andalou
1292                  Ghosts of Girlfriends Past
12247    Princess Gwenevere and the Jewel Riders
3277                        Hauru no ugoku shiro
1391                             Vampire Academy
2994                                   Self/less
4666                                Dragonslayer
2929                           Human Trafficking
5198                   Oz the Great and Powerful
10437                               Yeogo goedam
Name: title, dtype: object


In [None]:
user_input = "I like movies about psychopath."
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

4055                                         Jules et Jim
1541                                  Det sjunde inseglet
4211                                      American Psycho
4301                                                 Milk
5753                                      Men in Black II
1292                           Ghosts of Girlfriends Past
2752                                Nymphomaniac: Vol. II
1963    Alphaville, une étrange aventure de Lemmy Caution
5661                                   30 Minutes or Less
2649                                                Shrek
Name: title, dtype: object


## Modèle : Word2Vec

In [None]:
import pandas as pd

# Création d'un dataframe à partir de notre dataset
df = pd.read_csv("/content/drive/Shareddrives/Chatbot/mpst_full_data.csv")
df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb


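### Cleaning (le dataset nettoyé est déjà présent dans le drive : seules les cellules qui appliquent `clean_text` au dataset et sauvegardent le CSV peuvent être sautées ; les imports et la définition de `clean_text` restent nécessaires pour les fonctions `recommend_movies` plus bas)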

In [None]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import string

# Text preprocessing
def clean_text(text, stop_words=None):
    """Normalise a text: strip punctuation, lower-case it and drop stop words.

    The steps run in that order, so stop words are matched against words whose
    punctuation has already been removed (e.g. "don't" becomes "dont").

    Args:
        text: The string to clean.
        stop_words: Optional iterable of lower-case words to remove. Defaults to
            NLTK's English stop-word list, which requires the ``stopwords``
            corpus to have been downloaded.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup set once per call: the previous version re-read NLTK's
    # stop-word list for every single word of the text.
    stop_words = set(stop_words)

    # Delete every ASCII punctuation character in one pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
df['clean_plot'] = df['plot_synopsis'].apply(lambda x: clean_text(x))

In [None]:
df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,clean_plot
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb,note synopsis orginal italian release segments...
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb,two thousand years ago nhagruul foul sorcerer ...
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb,matuscheks gift store budapest workplace alfre...
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb,glenn holland morning person anyones standards...
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb,may 1980 cuban man named tony montana al pacin...


In [None]:
# Sauvegarde du nouveau dataframe
df.to_csv('/content/drive/Shareddrives/Chatbot/mpst_data_clean.csv', index=False)

### Modèle 3 : modèle personnalisé

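#### Création du modèle (pas besoin de réentraîner le modèle : il est déjà sauvegardé dans le drive ; exécuter néanmoins la cellule qui charge `mpst_data_clean.csv`, dont la colonne `clean_plot` est utilisée plus bas)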

In [None]:
import pandas as pd

# Création d'un dataframe à partir de notre dataset
df = pd.read_csv("/content/drive/Shareddrives/Chatbot/mpst_data_clean.csv")
df.head()

Unnamed: 0,imdb_id,title,plot_synopsis,tags,split,synopsis_source,clean_plot
0,tt0057603,I tre volti della paura,Note: this synopsis is for the orginal Italian...,"cult, horror, gothic, murder, atmospheric",train,imdb,note synopsis orginal italian release segments...
1,tt1733125,Dungeons & Dragons: The Book of Vile Darkness,"Two thousand years ago, Nhagruul the Foul, a s...",violence,train,imdb,two thousand years ago nhagruul foul sorcerer ...
2,tt0033045,The Shop Around the Corner,"Matuschek's, a gift store in Budapest, is the ...",romantic,test,imdb,matuscheks gift store budapest workplace alfre...
3,tt0113862,Mr. Holland's Opus,"Glenn Holland, not a morning person by anyone'...","inspiring, romantic, stupid, feel-good",train,imdb,glenn holland morning person anyones standards...
4,tt0086250,Scarface,"In May 1980, a Cuban man named Tony Montana (A...","cruelty, murder, dramatic, cult, violence, atm...",val,imdb,may 1980 cuban man named tony montana al pacin...


In [None]:
from gensim.models import Word2Vec

# One token list per row, obtained by whitespace-splitting the pre-cleaned synopsis.
sentences = [row.split() for row in df["clean_plot"]]

# Train a Word2Vec model on the cleaned synopses: 100-dimensional vectors, context window of 5,
# words seen fewer than 2 times are ignored, 4 worker threads, sg=0 selects the CBOW architecture.
# NOTE(review): no seed is set and training is multi-threaded, so results presumably vary between runs.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4, sg=0)

# Persist the trained model to the shared drive so that a later cell can load it instead of retraining.
model.save("/content/drive/Shareddrives/Chatbot/word2vec.model")

#### Utilisation du modèle

In [None]:
from gensim.models import Word2Vec

model = Word2Vec.load("/content/drive/Shareddrives/Chatbot/word2vec.model")

In [None]:
import numpy as np

def get_mean_vector(model, words):
    """Average the embeddings of the words that the model knows.

    Args:
        model: Either a gensim ``Word2Vec`` model (vectors under ``model.wv``)
            or a ``KeyedVectors`` instance.
        words: Iterable of tokens; tokens missing from the vocabulary are ignored.

    Returns:
        A 1-D numpy array of length ``vector_size``. When none of the words is
        known, a zero vector is returned instead of an empty list, so that every
        result has the same shape and can be passed to a similarity function
        (an empty list makes ``cosine_similarity`` raise).
    """
    # Word2Vec exposes its vectors as `.wv`; KeyedVectors is the vector store itself.
    vectors = getattr(model, "wv", model)
    known = [word for word in words if word in vectors.key_to_index]
    if known:
        return np.mean(vectors[known], axis=0)
    return np.zeros(vectors.vector_size, dtype=np.float32)

df['mean_vector'] = df['clean_plot'].apply(lambda x: get_mean_vector(model, x.split()))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(input_text):
    """Recommend the 10 films whose synopsis embedding is closest to a chatbot message.

    The message is cleaned with ``clean_text``, turned into a mean word vector
    with ``get_mean_vector`` and compared (cosine similarity) with the
    precomputed ``df['mean_vector']`` column.

    Args:
        input_text: Raw user message.

    Returns:
        A DataFrame with columns ``title`` and ``similarity`` (indexed like
        ``df``), best match first, at most 10 rows. It is empty when no word of
        the message is known to the model. Unlike the previous version, the
        global ``df`` is not modified (no ``similarity`` column is written).
    """
    clean_input = clean_text(input_text)
    input_vector = np.asarray(get_mean_vector(model, clean_input.split()), dtype=float)

    no_match = df.iloc[:0][['title']].assign(similarity=np.empty(0))
    # Empty or all-zero vector: the model knows none of the words, nothing can be ranked.
    if not input_vector.any():
        return no_match

    # Ignore rows without a usable vector (e.g. an empty list for a synopsis with no known word).
    usable = df['mean_vector'].apply(lambda vec: np.size(vec) == input_vector.size)
    if not usable.any():
        return no_match

    # One vectorized similarity computation instead of one scikit-learn call per row.
    plot_vectors = np.vstack(df.loc[usable, 'mean_vector'].tolist())
    scores = cosine_similarity(plot_vectors, input_vector.reshape(1, -1)).ravel()

    ranked = df.loc[usable, ['title']].assign(similarity=scores)
    return ranked.sort_values(by='similarity', ascending=False).head(10)

In [None]:
user_input = "the dark knight"
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

                                               title  similarity
14093                                   Fantaghirò 2    0.675486
11987                          Jack the Giant Killer    0.649970
6891                           Jack the Giant Killer    0.649970
11839  Ils se marièrent et eurent beaucoup d'enfants    0.649951
6428                              Happily Ever After    0.649951
13678              Prince of Persia: The Two Thrones    0.639082
14080                                Teenage Caveman    0.637490
12247        Princess Gwenevere and the Jewel Riders    0.609445
10579               Prince of Persia: Warrior Within    0.604735
7671                                          Himiko    0.602087


In [None]:
user_input = "A young wizard learns about magic"
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

                                                   title  similarity
1301                                         The Prodigy    0.668648
12247            Princess Gwenevere and the Jewel Riders    0.617122
7349                               The Flight of Dragons    0.599905
10931                                          Wing Chun    0.599185
3407                                           The Mummy    0.582041
8161                                      The Great Seer    0.580648
8034                    Snow White and the Three Stooges    0.574858
10810                 Lands of Lore: The Throne of Chaos    0.573425
12325  Gekijou-ban Mahou Shoujo Madoka*Magica: [Zenpe...    0.570407
1927                                              San wa    0.569727


In [None]:
user_input = "I like movies about psychopath."
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

                     title  similarity
10077             Dog City    0.694413
9355            The Critic    0.685568
12605           The Critic    0.685568
12470            Illusions    0.660025
4562   Alice in Wonderland    0.633407
10315             Seinfeld    0.626778
4964             The Quest    0.625632
5328                Aswang    0.623450
1764              Gigantic    0.618974
8322         Don't Look Up    0.618649


### Modèle 4 : modèle pré-entrainé

#### Téléchargement du modèle (pas besoin d'exécuter cette partie : le modèle est déjà présent dans le drive)

In [None]:
import gensim.downloader as api
model = api.load('word2vec-google-news-300')



In [None]:
model.save("/content/drive/Shareddrives/Chatbot/google.model")

#### Utilisation du modèle

In [None]:
from gensim.models import KeyedVectors

model = KeyedVectors.load("/content/drive/Shareddrives/Chatbot/google.model")

In [None]:
import numpy as np

def get_mean_vector(model, words):
    """Average the embeddings of the words that the model knows.

    Args:
        model: Either a gensim ``KeyedVectors`` instance or a ``Word2Vec``
            model (whose vectors live under ``model.wv``).
        words: Iterable of tokens; tokens missing from the vocabulary are ignored.

    Returns:
        A 1-D numpy array of length ``vector_size``. When none of the words is
        known, a zero vector is returned instead of an empty list, so that every
        result has the same shape and can be passed to a similarity function
        (an empty list makes ``cosine_similarity`` raise).
    """
    # Word2Vec exposes its vectors as `.wv`; KeyedVectors is the vector store itself.
    vectors = getattr(model, "wv", model)
    known = [word for word in words if word in vectors.key_to_index]
    if known:
        return np.mean(vectors[known], axis=0)
    return np.zeros(vectors.vector_size, dtype=np.float32)

df['mean_vector'] = df['clean_plot'].apply(lambda x: get_mean_vector(model, x.split()))

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_movies(input_text):
    """Recommend the 10 films whose synopsis embedding is closest to a chatbot message.

    The message is cleaned with ``clean_text``, turned into a mean word vector
    with ``get_mean_vector`` and compared (cosine similarity) with the
    precomputed ``df['mean_vector']`` column.

    Args:
        input_text: Raw user message.

    Returns:
        A DataFrame with columns ``title`` and ``similarity`` (indexed like
        ``df``), best match first, at most 10 rows. It is empty when no word of
        the message is known to the model. Unlike the previous version, the
        global ``df`` is not modified (no ``similarity`` column is written).
    """
    clean_input = clean_text(input_text)
    input_vector = np.asarray(get_mean_vector(model, clean_input.split()), dtype=float)

    no_match = df.iloc[:0][['title']].assign(similarity=np.empty(0))
    # Empty or all-zero vector: the model knows none of the words, nothing can be ranked.
    if not input_vector.any():
        return no_match

    # Ignore rows without a usable vector (e.g. an empty list for a synopsis with no known word).
    usable = df['mean_vector'].apply(lambda vec: np.size(vec) == input_vector.size)
    if not usable.any():
        return no_match

    # One vectorized similarity computation instead of one scikit-learn call per row.
    plot_vectors = np.vstack(df.loc[usable, 'mean_vector'].tolist())
    scores = cosine_similarity(plot_vectors, input_vector.reshape(1, -1)).ravel()

    ranked = df.loc[usable, ['title']].assign(similarity=scores)
    return ranked.sort_values(by='similarity', ascending=False).head(10)

In [None]:
user_input = "the dark Knight"
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

                                               title  similarity
9539          King Arthur and the Knights of Justice    0.625641
14093                                   Fantaghirò 2    0.621938
13382                           L'armata Brancaleone    0.585123
5727                     Snow White and the Huntsman    0.581870
5283                              Puen yai jon salad    0.575551
11839  Ils se marièrent et eurent beaucoup d'enfants    0.574915
6428                              Happily Ever After    0.574915
12870                 Scooby-Doo and the Goblin King    0.570638
9323               Scooby-Doo! And the Samurai Sword    0.569060
13599                                    Sailor Moon    0.568102


In [None]:
user_input = "A young wizard learns about magic"
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

                                          title  similarity
10451                                    Aladin    0.678756
8942                                     Vidocq    0.668446
7577                                  Winx Club    0.648193
8733    Winx Club: Il segreto del Regno Perduto    0.648193
7349                      The Flight of Dragons    0.647758
1301                                The Prodigy    0.633648
3890                              Your Highness    0.632788
14475                     Eolguleobtneun minyeo    0.632268
10745  Harry Potter and the Philosopher's Stone    0.630232
1328      Harry Potter and the Sorcerer's Stone    0.630232


In [None]:
user_input = "I like movies about psychopath."
recommended_movies = recommend_movies(user_input)
print(recommended_movies)

                         title  similarity
9165  The Kentucky Fried Movie    0.655495
5328                    Aswang    0.614513
8615               Blood Dolls    0.605408
3941            The Human Race    0.597606
9068    Berberian Sound Studio    0.597284
1019               The Funeral    0.594104
273                        Die    0.588792
8322             Don't Look Up    0.587917
1306                     Rinne    0.586436
6762               Jigarthanda    0.586429


# Chatbot


## Création du chatbot

### Installation de llama-cpp-python et des packages nécessaires

In [None]:
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.20.tar.gz (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.20-cp310-cp310-manylinux_2_35_x86_64.whl size=7138406 sha256=b96852c0d841b8feccf66732577151e05902a86c590b3c4de95d20e94a4fa4d3
  Stored in directory: /root/.cache/pip/wheels/ef/f2/d2/0becb03047a348d7bd9a5b91ec88f4654d6fa7d67ea4e84d43
Successfully built llama-cpp-python
Installing collected packages: llama-cpp-python
Successfully installed llama-cpp-python-0.2.20


In [None]:
!git clone https://github.com/ggerganov/llama.cpp.git
!pip install -r llama.cpp/requirements.txt

Cloning into 'llama.cpp'...
remote: Enumerating objects: 12959, done.[K
remote: Counting objects: 100% (3527/3527), done.[K
remote: Compressing objects: 100% (289/289), done.[K
remote: Total 12959 (delta 3370), reused 3294 (delta 3237), pack-reused 9432[K
Receiving objects: 100% (12959/12959), 15.47 MiB | 23.94 MiB/s, done.
Resolving deltas: 100% (9012/9012), done.
Collecting numpy==1.24.4 (from -r llama.cpp/requirements.txt (line 1))
  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.3/17.3 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece==0.1.98 (from -r llama.cpp/requirements.txt (line 2))
  Downloading sentencepiece-0.1.98-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gguf>=0.1.0 (fro

In [None]:
!pip install huggingface_hub langchain

Collecting langchain
  Downloading langchain-0.0.348-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-core<0.1,>=0.0.12 (from langchain)
  Downloading langchain_core-0.0.12-py3-none-any.whl (181 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m181.5/181.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langsmith<0.1.0,>=0.0.63 (from langchain)
  Downloading langsmith-0.0.69-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Downloading ma

In [None]:
# Téléchargement du modèle à partir du hub huggingface
# snapshot_download sert à sauvegarder le modèle localement
from huggingface_hub import snapshot_download

model_id="mistralai/Mistral-7B-Instruct-v0.1"
snapshot_download(repo_id=model_id, local_dir="mistral-hf",
                  local_dir_use_symlinks=False, revision="main")

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.85k [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

'/content/mistral-hf'

In [None]:
# Conversion du modèle vers un format utilisable par la library (gguf)
!python llama.cpp/convert.py mistral-hf \
  --outfile mistral-7B-Instruct-v0.1.gguf \
  --outtype f16

Loading model file mistral-hf/pytorch_model-00001-of-00002.bin
Loading model file mistral-hf/pytorch_model-00001-of-00002.bin
Loading model file mistral-hf/pytorch_model-00002-of-00002.bin
params = Params(n_vocab=32000, n_embd=4096, n_layer=32, n_ctx=32768, n_ff=14336, n_head=32, n_head_kv=8, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=10000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=<GGMLFileType.MostlyF16: 1>, path_model=PosixPath('mistral-hf'))
Loading vocab file 'mistral-hf/tokenizer.model', type 'spm'
Permuting layer 0
Permuting layer 1
Permuting layer 2
Permuting layer 3
Permuting layer 4
Permuting layer 5
Permuting layer 6
Permuting layer 7
Permuting layer 8
Permuting layer 9
Permuting layer 10
Permuting layer 11
Permuting layer 12
Permuting layer 13
Permuting layer 14
Permuting layer 15
Permuting layer 16
Permuting layer 17
Permuting layer 18
Permuting layer 19
Permuting layer 20
Permuting layer 21
Permuting layer 22
Permuting layer 23
Perm

### Chargement du modèle et test

In [None]:
from langchain.llms import LlamaCpp

llm = LlamaCpp(
  model_path="mistral-7B-Instruct-v0.1.gguf",
  temperature=0.,
  max_tokens=200,
  top_p=1,
  n_gpu_layers = 100,    # nombre de couches à chargers sur le GPU
  verbose=True
)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | 


In [None]:
from langchain.prompts import PromptTemplate

#Création d'un template pour tester notre modèle
template = PromptTemplate.from_template(
    """
      Recommend me 5 other movies (only title) that match: {text}
    """
)

In [None]:
#Test du modèle
response = llm(template.format(text="The Dark Knight"))

In [None]:
print(response)


1. Inception
2. The Prestige
3. The Departed
4. The Matrix
5. The Silence of the Lambs


## RAG (passer cette étape si cela prend trop de temps)

In [None]:
# Connexion au drive pour avoir accès au dataset
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader
csvLoader = CSVLoader("/content/drive/Shareddrives/Chatbot/mpst_data_clean.csv")
documents = csvLoader.load()

In [None]:
# Division des documents en morceaux
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(documents)

In [None]:
!pip install -U sentence-transformers

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
modelsentencetransformer = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

model_kwargs = {'device':'cpu'}
encode_kwargs = {'normalize_embeddings':False}
embeddings = HuggingFaceEmbeddings(
  model_name = 'all-MiniLM-L6-v2',
  model_kwargs = model_kwargs,
  encode_kwargs=encode_kwargs
)

In [None]:
!pip install chromadb

In [None]:
# Recherche de similarité avec la question donnée
from langchain.vectorstores import Chroma
db = Chroma.from_documents(docs, embeddings)
question = "what's batman genre ?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

In [None]:
from langchain.prompts import PromptTemplate

template = PromptTemplate.from_template(
    """
      Use your knowlege and the data given to recommend 5 movies similar to the movie given , If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible.
      Recommend me 5 other movies (only title) that match: {text}
    """
)

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# The "stuff" chain used by RetrievalQA fills two prompt variables: {context} (the retrieved
# chunks) and {question} (the user query). A prompt that only declares {text} is rejected when
# the chain is built, so a dedicated RAG prompt is defined here; the global `template`
# (used by the chatbot interface with {text}) is left untouched.
# NOTE(review): the retrieved chunks are inserted into the prompt — confirm that the LlamaCpp
# context window (n_ctx) is large enough for them.
rag_template = PromptTemplate.from_template(
    """
      Use your knowledge and the context below to recommend 5 movies similar to the movie given. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible.

      Context:
      {context}

      Recommend me 5 other movies (only title) that match: {question}
    """
)

qa_chain = RetrievalQA.from_chain_type(
  llm=llm,
  chain_type="stuff",
  retriever=db.as_retriever(),
  chain_type_kwargs={"prompt": rag_template}
)
question = "Batman"
result = qa_chain({"query": question})
print(result["result"])

## Interface

In [None]:
# Connexion au drive pour avoir accès au dataset
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

#Création d'un dataframe à partir de notre dataset
df = pd.read_csv("/content/drive/Shareddrives/Chatbot/mpst_full_data.csv")

In [None]:
!pip install ipywidgets

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi
Successfully installed jedi-0.19.1


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Notebook chat interface built with ipywidgets
class ChatbotGUI:
    """Minimal ipywidgets chat front-end for the movie recommendation LLM.

    The interface shows an output area, a text field and a "Send" button. Each
    message is sent to the global ``llm`` through the global prompt ``template``;
    the titles found in the answer are enriched with the synopsis and tags of
    the global dataframe ``df`` when the film is present in the dataset.
    """

    def __init__(self):
        # Plain-text transcript of the conversation (user messages and chatbot answers).
        self.chat_history = ""
        self.create_widgets()

    def create_widgets(self):
        """Create and display the output area, the input field and the Send button."""
        # Output widget that receives everything printed by the chatbot
        self.output_text = widgets.Output()
        with self.output_text:
            print("Chatbot: Hey, I'm a recommendation movie chatbot.")
        display(self.output_text)

        # Text field for the user message
        self.input_entry = widgets.Text(placeholder='Type here...', layout=widgets.Layout(width='400px'))
        display(self.input_entry)

        # "Send" button that submits the message
        self.send_button = widgets.Button(description='Send')
        self.send_button.on_click(self.get_response)
        display(self.send_button)

    def get_response(self, b):
        """Button callback: read the message, query the chatbot and print the exchange.

        Args:
            b: The clicked button (passed by ipywidgets, unused).
        """
        # Read the message and reset the input field
        user_input = self.input_entry.value.strip()
        self.input_entry.value = ""
        # Ignore clicks on an empty field instead of querying the model with nothing.
        if not user_input:
            return

        self.chat_history += f"You: {user_input}\n"
        with self.output_text:
            print(f"You: {user_input}")
            response = self.get_chatbot_response(user_input)
            print(f"{response}")
        # Record the answer too, so the transcript holds both sides of the conversation.
        self.chat_history += f"{response}\n"

    @staticmethod
    def _parse_movie_titles(response):
        """Extract the titles from the numbered lines ("1. Title" or "1) Title") of an LLM answer.

        Lines that are blank or not numbered (introductions, comments...) are
        skipped instead of raising; a title wrapped in matching quotes is unquoted.
        """
        titles = []
        for line in response.splitlines():
            stripped = line.strip()
            # Number of leading digit characters.
            digits = len(stripped) - len(stripped.lstrip('0123456789'))
            if digits == 0 or digits >= len(stripped) or stripped[digits] not in '.)':
                continue
            title = stripped[digits + 1:].strip()
            if len(title) >= 2 and title[0] == title[-1] and title[0] in '"\'':
                title = title[1:-1].strip()
            if title:
                titles.append(title)
        return titles

    def get_chatbot_response(self, user_input):
        """Ask the LLM for recommendations and format them for display.

        Args:
            user_input: The user message inserted in the prompt template.

        Returns:
            The text to display: a numbered list of films (with synopsis excerpt
            and tags for films found in the dataset), the raw model answer when
            it contains no numbered title, or an apology message on failure.
        """
        try:
            # Query the model with the user message
            response = llm(template.format(text=user_input))
            movies = self._parse_movie_titles(response)
            if not movies:
                # Nothing that looks like a numbered list: show what the model said, if anything.
                raw_answer = response.strip()
                if raw_answer:
                    return f"Chatbot: {raw_answer}"
                return "I'm sorry, but I'm having trouble providing a response right now."

            # Lower-case the title column once rather than once per recommended film.
            titles_lower = df["title"].str.lower()
            chatbot_response = "Chatbot: Here some recommendations: \n"

            for counter, movie in enumerate(movies, start=1):
                matches = df[titles_lower == movie.lower()]
                if not matches.empty:
                    # Film present in the dataset: show its synopsis excerpt and tags
                    movie_data = matches.iloc[0]
                    chatbot_response += (
                        f"{counter}. {movie_data['title']}\n"
                        f"Synopsis: {movie_data['plot_synopsis'][:500]}...\n"
                        f"Tags: {movie_data['tags']}\n\n"
                    )
                else:
                    # Film absent from the dataset: show the title only
                    chatbot_response += f"{counter}. {movie}\n\n"

            return chatbot_response
        except Exception as e:
            # Best-effort UI boundary: report the error and keep the interface usable.
            print(f"An error occurred: {e}")
            return "I'm sorry, but I'm having trouble providing a response right now."

In [None]:
# Création d'une instance de la classe ChatbotGUI
chatbot = ChatbotGUI()

Output()

Text(value='', layout=Layout(width='400px'), placeholder='Type here...')

Button(description='Send', style=ButtonStyle())