In [1]:
import pandas as pd   #Veri seti üzerinde işlem yapmak
import numpy as np    #Matematik işlem 

In [2]:
df = pd.read_csv('atis_intents.csv') # Veri setinin okunması.

In [3]:
df.head(10)  #Veri setinin ilk 10 elemanını tablo olarak gösterir

Unnamed: 0,subject,review
0,atis_flight,i want to fly from boston at 838 am and arriv...
1,atis_flight,what flights are available from pittsburgh to...
2,atis_flight_time,what is the arrival time in san francisco for...
3,atis_airfare,cheapest airfare from tacoma to orlando
4,atis_airfare,round trip fares from pittsburgh to philadelp...
5,atis_flight,i need a flight tomorrow from columbus to min...
6,atis_aircraft,what kind of aircraft is used on a flight fro...
7,atis_flight,show me the flights from pittsburgh to los an...
8,atis_flight,all flights from boston to washington
9,atis_ground_service,what kind of ground transportation is availab...


In [4]:
df.info() # Veri seti hakkında genel bilgi. Boş değer olmadığı gözüküyor.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4978 entries, 0 to 4977
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  4978 non-null   object
 1   review   4978 non-null   object
dtypes: object(2)
memory usage: 77.9+ KB


In [5]:
df['subject'].value_counts() # Unique konuların veri miktarları. 22 adet konu bulunmakta.

atis_flight                                 3666
atis_airfare                                 423
atis_ground_service                          255
atis_airline                                 157
atis_abbreviation                            147
atis_aircraft                                 81
atis_flight_time                              54
atis_quantity                                 51
atis_flight#atis_airfare                      21
atis_airport                                  20
atis_distance                                 20
atis_city                                     19
atis_ground_fare                              18
atis_capacity                                 16
atis_flight_no                                12
atis_meal                                      6
atis_restriction                               6
atis_airline#atis_flight_no                    2
atis_ground_service#atis_ground_fare           1
atis_airfare#atis_flight_time                  1
atis_cheapest       

In [6]:
import nltk                                 #Dil kütüphanesi
from nltk.corpus import stopwords
import textblob
from textblob import Word

sw=stopwords.words("english")

In [7]:
def data_cleaning(df,choice): 
    df = df.applymap(lambda s: s.lower() if type(s) == str else s) # Tüm metinler küçük harf formatına dönüştürülür
    df["review"]=df["review"].str.replace("[^\w\s]","") # Noktalama işaretlerinin silinmesi
    df["review"]=df["review"].str.replace("\d","") # Sayıların silinmesi
    df["review"]=df["review"].apply(lambda x: " ".join(x for x in x.split() if x not in sw)) # Stopwords kelimelerin silinmesi
    if choice == 1:
        df["review"]=df["review"].apply(lambda x: " ".join(Word(i).lemmatize() for i in x.split())) # lemmatization
    elif choice == 2:
        df["review"]=df["review"].apply(lambda x: " ".join(Word(i).stem() for i in x.split())) # Stemming
    return df['review']

In [8]:
df['review']=data_cleaning(df, 1) # Veri temizleme

  df["review"]=df["review"].str.replace("[^\w\s]","") # Noktalama işaretlerinin silinmesi
  df["review"]=df["review"].str.replace("\d","") # Sayıların silinmesi


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer # Vektörleştirme için TF-IDF

In [10]:
vectorizer = TfidfVectorizer(lowercase=False,ngram_range=(1,2),dtype=np.byte)
X = vectorizer.fit_transform(df['review'].tolist())



In [11]:
from sklearn.preprocessing import LabelEncoder
# Tahminlemenin performansını değerlendirebilmek için stringler sayısal olarak ifade edilir.
le = LabelEncoder()
df['subject'] = le.fit_transform(df['subject'])

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,df['subject'],random_state=1,test_size=0.2) # Veri setinin bölünmesi.

In [14]:
df_cluster_train = pd.DataFrame(data={'cluster': y_train, 'vector': X_train.toarray().tolist()}) # Eğitim verisinin veri seti haline getirilmesi

In [15]:
df_cluster_test = pd.DataFrame(data={'cluster': y_test, 'vector': X_test.toarray().tolist()}) # Test verisinin veri seti haline getirilmesi

In [16]:
df_cluster_train['vector'] = df_cluster_train['vector'].apply(lambda x: np.array(x)) #eğitim veri setinin vektörü alınması
df_cluster_test['vector'] = df_cluster_test['vector'].apply(lambda x: np.array(x))   #test veri setinin vektörü alınması

In [17]:
df_cluster_train.head()

Unnamed: 0,cluster,vector
1038,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2781,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2139,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2641,3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1216,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [18]:
df_cluster_train.groupby('cluster')['vector'].apply(np.mean) # Küme merkezleri için vektörlerin ortalaması alınır.

cluster
0     [0.0, 0.0, 0.01300200368030033, 0.003842355876...
1     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
3     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
5     [0.0, 0.0, 0.011715458403966015, 0.0, 0.0, 0.0...
6     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
7     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
8     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
9     [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
10    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
11    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
12    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.591...
13    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
14    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
15    [0.00993094559106707, 0.00993094559106707, 0.0...
16    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
17    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0

In [19]:
centroids = df_cluster_train.groupby('cluster')['vector'].apply(np.mean).tolist() # Küme merkezlerinin listesi.
clusters = range(0,22)

In [20]:
df_cluster_test.head()

Unnamed: 0,cluster,vector
3547,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
248,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1653,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2689,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3128,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [21]:
from scipy import spatial

predicts = []
for i in df_cluster_test.vector:
    distances = []
    for j in centroids:
        distances.append(1 - spatial.distance.cosine(i,j)) # Her noktanın küme merkezlerine olan kosünüs benzerliğinin hesaplanması.
    index = distances.index(max(distances)) # En benzer olan küme merkezinin indisi alınır.
    predicts.append(clusters[index]) # Alınan indise göre bulunan kümenin adı tahmin olarak alınır.

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [22]:
df_cluster_test['predict'] = predicts # Tahminler test veri setine eklenir.

In [23]:
df_cluster_test.head()

Unnamed: 0,cluster,vector,predict
3547,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",12
248,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",12
1653,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",12
2689,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",12
3128,12,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6


In [24]:
from sklearn.metrics import accuracy_score #doğruluk skorunun hesaplanabilmesi için gerekli olan kütüphane

pred = df_cluster_test.predict # Tahminler liste olarak alınır.
true = df_cluster_test.cluster # Gerçek değerler liste olarak alınır.
print("Accuracy Score for Clustering",accuracy_score(true, pred)) # Modelin doğruluğu hesaplanır.

Accuracy Score for Clustering 0.8323293172690763


In [25]:
from sklearn.linear_model import LogisticRegression #lojistik regresyon modeli için gerekli olan kütüphane
lr=LogisticRegression()
model_lr=lr.fit(X_train,y_train)
prediction_lr=model_lr.predict(X_test)
print("Accuracy Score on traning data: ",lr.score(X_train,y_train))
print("Accuracy Score on test data: ",lr.score(X_test,y_test))
print("Accuracy Score: ",accuracy_score(y_test, prediction_lr))

Accuracy Score on traning data:  0.9337016574585635
Accuracy Score on test data:  0.9116465863453815
Accuracy Score:  0.9116465863453815


In [26]:
from sklearn.naive_bayes import MultinomialNB  #multinomial naive bayes modeli için gerekli olan kütüphane
mnb=MultinomialNB()
mnb.fit(X_train,y_train)
y_pred=mnb.predict(X_test)
print("Accuracy Score on traning data",mnb.score(X_train,y_train))
print("Accuracy Score on test data",mnb.score(X_test,y_test))
print("Accuracy Score: ",accuracy_score(y_test, y_pred))

Accuracy Score on traning data 0.8375188347564038
Accuracy Score on test data 0.8363453815261044
Accuracy Score:  0.8363453815261044


In [27]:
from sklearn import svm #sınıflandırma veya regresyon problemleri için kullanılabilen denetimli bir makine öğrenmesi algoritmasıdır
svc = svm.LinearSVC()
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print("Accuracy Score on traning data",svc.score(X_train,y_train))
print("Accuracy Score on test data",svc.score(X_test,y_test))
print("Accuracy Score: ",accuracy_score(y_test, y_pred))

Accuracy Score on traning data 0.9974886991461577
Accuracy Score on test data 0.9568273092369478
Accuracy Score:  0.9568273092369478
