In [1]:
#import library

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#Data set

df = pd.read_csv("jokes.csv")

In [3]:
df.info

<bound method DataFrame.info of           ID                                           Question  \
0          1  Did you hear about the Native American man tha...   
1          2       What's the best anti diarrheal prescription?   
2          3  What do you call a person who is outside a doo...   
3          4  Which Star Trek character is a member of the m...   
4          5  What's the difference between a bullet and a h...   
...      ...                                                ...   
38264  38265  Q: Why did the pacifist /b/tard try to calm ev...   
38265  38266            Q: Why can't Obama poke fun at himself?   
38266  38267             Why is gambling not allowed in Africa?   
38267  38268       What do you call three witches in a hot tub?   
38268  38269  What do scientists use to measure a chicken's ...   

                                     Answer  
0       He nearly drown in his own tea pee.  
1                          Mycheexarphlexin  
2                        

In [4]:
#Show Data at head

df.head(10)

Unnamed: 0,ID,Question,Answer
0,1,Did you hear about the Native American man tha...,He nearly drown in his own tea pee.
1,2,What's the best anti diarrheal prescription?,Mycheexarphlexin
2,3,What do you call a person who is outside a doo...,Matt
3,4,Which Star Trek character is a member of the m...,Jean-Luc Pickacard
4,5,What's the difference between a bullet and a h...,A bullet doesn't miss Harambe
5,6,Why was the Ethiopian baby crying?,He was having a mid-life crisis
6,7,What's the difference between a corn husker wi...,One shucks between fits...
7,8,Who is 2016's biggest sellout?,Kevin Durant or Bernie Sanders?
8,9,Why is little Annie's shoe floating in the sea?,Because the shark burped.
9,10,What's the difference between a married man an...,"A bachelor will go to the fridge, sees nothing..."



# Cleaning the dataset:

In [5]:
#Drop ID colomn

df=df.drop(['ID'], axis=1)

In [6]:
#Show Data at Random

df.sample(10)

Unnamed: 0,Question,Answer
9424,How do you start making big bucks?,With a little doe
8028,"What's big, black, and explodes when shaken?",A COKe bottle.
31790,"What's the definition of ""trust""?",Two cannibals doing 69.
8,Why is little Annie's shoe floating in the sea?,Because the shark burped.
29371,What do you call someone who believes rotten e...,An eggsy-stench-alist.
33896,What's the most useless thing on a woman?,A drunk Irishman.
20677,What do you call a woman that raps about woman...,Feminem
16422,What's your most fucked up joke?,I'm looking for the most cancerous jokes you c...
16044,When's sex not work?,When it's for play.
3736,What do you call a fly with no wings?,A walk.


In [7]:

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
#removing special characters, numbers, and stopwords

def clean_text(text):
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()
    text = text.split()
    text = [lemmatizer.lemmatize(word) for word in text if not word in stop_words]
    text = ' '.join(text)
    return text

In [9]:
df['cleaned_Question'] = df['Question'].apply(clean_text)
df['cleaned_Answer'] = df['Answer'].apply(clean_text)

In [11]:
df.sample(10)

Unnamed: 0,Question,Answer,cleaned_Question,cleaned_Answer
10438,Do you know how electricity works?,The truth may shock you.,know electricity work,truth may shock
35894,What did the lesbian vampire say to her girlfr...,See you next month!,lesbian vampire say girlfriend,see next month
2669,Why can't the T-rex clap its hands?,Because it's dead.,rex clap hand,dead
35833,How do you know you're getting old?,"When you exit a museum, you trigger the alarm.",know getting old,exit museum trigger alarm
10355,What do Caitlyn Jenner and my Chrysler have in...,They're both convertables.,caitlyn jenner chrysler common,convertables
21946,What do you call a person who is half Jewish?,Jew-ish,call person half jewish,jew ish
6678,What do you tell a girl with two black eyes?,"Nothing, you already told her twice.",tell girl two black eye,nothing already told twice
18669,Why is a roach clip called a roach clip?,Because pot holder was already taken -rim shot-,roach clip called roach clip,pot holder already taken rim shot
8826,What do you call an alien that's also a pedoph...,An Extramolestrial,call alien also pedophile,extramolestrial
16972,What do you call attempting to jump to light s...,A Wookie mistake!,call attempting jump light speed checking hype...,wookie mistake


In [14]:
# Vectorize the text 
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_Question'])
y = vectorizer.transform(df['cleaned_Answer'])

In [19]:
#Training the chatbot
def generate_response(user_input):
    user_input = clean_text(user_input)
    user_input_vector = vectorizer.transform([user_input])
    similarities = cosine_similarity(user_input_vector, X).flatten()
    index = np.argmax(similarities)
    return df['cleaned_Answer'][index]

In [None]:
# Testing the chatbot
while True:
    user_input = input('You: ')
    if user_input.lower() == 'quit':
        break
    response = generate_response(user_input)
    print('Chatbot:', response)

You: Do you know how electricity works?	
Chatbot: truth may shock
You: call alien also pedophile	
Chatbot: extramolestrial
You: Why is a roach clip called a roach clip?	
Chatbot: pot holder already taken rim shot
