In [2]:
from subprocess import check_output

# Run `ls` on the input directory to see which data files are available.
list_input_dir = ["ls", "input/"]
check_output(list_input_dir)

b'7all.csv\n'

# **In this notebook we will use Python and its libraries to:**

   # 1-  Read data from a csv file, extract the words and then clean them
  #  2- Calculate the TF (term frequency) vector from the words
  #  3- Extract the TF from each text to turn it into a vector
  #  4- Train an ML (machine learning) model on the vectors and then test the model on other text data.
  #  5- Use the vectors to summarize texts

In [3]:
import pandas as pd #to read csv files
import random 
import numpy as np #handle operations on arrays easily

# Read the csv into a two-column table; the column names are supplied here
# instead of being taken from the file.
column_names = ["cat", "text"]
dataset = pd.read_csv(
    "input/7all.csv",
    names=column_names,
    delimiter=',',
    encoding='utf-8',
)

row_count = len(dataset)
print("dataset has", row_count, "lines")




dataset has 4900 lines


# Samples from Dataset

In [None]:
# Draw three rows at random to get a feel for the data.
sample_size = 3
samples = dataset.sample(n=sample_size)
print(samples)

# Lets take a look at one text sample

In [None]:
# Show the complete text of the first sampled row.
first_sample = samples.iloc[0]
print(first_sample.text)

# Calculate how many lines each category has.

In [None]:
# Group the rows by their category label and count the rows of each group.
rows_by_category = dataset.groupby("cat")
cats = rows_by_category.size()
print(cats)

# NEXT
# Find the word frequencies (how many times each word occurs in the texts).

In [None]:
from collections import Counter

# Split every text on single spaces and pool the tokens of all rows.
all_words = [token for text in dataset["text"] for token in text.split(" ")]

# Counter maps each distinct token to the number of times it appears.
words_freq = Counter(all_words)

print(words_freq)

# tools to process the text

In [3]:

#stop words, unpickled from the "stopwords" file, used to remove stop words from all the texts.
import pickle
# NOTE(review): pickle.load can execute arbitrary code while loading; only use
# a "stopwords" file that comes from a trusted source.
# The with-block closes the file even when unpickling raises.
with open("stopwords",'rb') as infile:
    stopwords = pickle.load(infile)
# Characters a token may consist of: the lowercase Turkish alphabet plus "_".
turkishCharaters = "abcçdefgğhıijklmnoöprsştuüvyz_"

def RealWord(string):
    """Return True when every character of `string` is in `turkishCharaters`.

    Used to weed out tokens that contain symbols, digits, capitals or other
    characters outside that alphabet. An empty string contains no offending
    character, so it yields True as well.
    """
    for char in string:
        if char not in turkishCharaters:
            return False
    return True


'bir'

# Let's start cleaning!!
# 1- Extract the words with a frequency bigger than the threshold from the 'words_freq' dict
# 2- Use the above tools to filter words and characters

# 3- [TODO] we could do stemming or lemmatizing
#  "gidiyorum" -> "gitme - git" (lemmatizer) or "gid" (stemmer) 

In [None]:
# Frequency cut-off: only words seen more than `threshold` times are kept.
threshold = 30
words_freq = {word: count for word, count in words_freq.items() if count > threshold}
# The surviving dictionary keys form the candidate vocabulary.
words = list(words_freq)
# Drop the stop words.
filtered_words = [word for word in words if word not in stopwords]
# Drop every token that RealWord rejects, then skip the first survivor.
# NOTE(review): the trailing [1:] discards whichever word happens to come
# first; presumably that is the empty-string token produced by repeated
# spaces - confirm.
filtered_words = [word for word in filtered_words if RealWord(word)][1:]
print("the number of words is",len(filtered_words),"word, after we cleaned it.")
print("-"*50)
print(filtered_words)

# ****The NEXT CODE IS OPTIONAL****

# now we can head back to our previous words_freq dict and see the frequencies of our cleaned words.

In [None]:
# Keep only the frequency entries whose word survived the cleaning step.
# A set makes each membership test constant-time; scanning the
# filtered_words list once per dictionary entry was quadratic.
kept_words = set(filtered_words)
new_freq = {word: count for word, count in words_freq.items() if word in kept_words}
# Order the entries from the most frequent word to the least frequent one.
new_freq = dict(sorted(new_freq.items(), key=lambda item: item[1], reverse=True))
print(new_freq)

# NOW we have cleaned the words and stored them in the "filtered_words" list

# NEXT we will apply the TF (term frequency) algorithm to all the texts of "dataset":

# for every text: < -- LOOP

 #     1- Create an init vector [0_1,0_2,...,0_n] <===> filtered_words [w1,w2,...,wn]
 #     2- Loop through the words of the text and count them: init vector ([0,0,...,0] ==> [1,3,0,0,5,...,1])
 #     3- Store the resulting vector in the all_vectors list

# The idea is to represent every text by a vector that holds, for each word of "filtered_words", how many times it occurs in the text

In [None]:
vocabulary_preview = filtered_words[:10]
print("for the first ten words:\n")
print("filtered_words:",vocabulary_preview)
print("-"*50)
# One zero counter per vocabulary word.
init_vector = [0 for _ in filtered_words]
print("init_vector:",init_vector[:10])

In [None]:
#[0,0,0....0] ==> [1,4,2....2]
def create_vector(text):
    """Count how often each word of `filtered_words` occurs in `text`.

    `text` is split on single spaces and every token is compared verbatim
    with the vocabulary, so tokens that differ in case or carry punctuation
    are not counted.

    Parameters
    ----------
    text : str
        The text to vectorise.

    Returns
    -------
    list of int
        One counter per entry of `filtered_words`, in the same order.
    """
    # Map each vocabulary word to its first position once, so that each token
    # costs a single dictionary lookup instead of two scans of the list.
    position_of = {}
    for position, vocab_word in enumerate(filtered_words):
        position_of.setdefault(vocab_word, position)

    # Size the vector from the vocabulary itself rather than from a global
    # that another cell has to create first.
    text_vector = [0] * len(filtered_words)
    for word in text.split(" "):
        position = position_of.get(word)
        if position is not None:
            text_vector[position] += 1
    return text_vector

# Build one term-frequency vector per text, in dataset order.
# Iterating the "text" column directly avoids iterrows() plus a second .loc
# lookup per row, which also misbehaves when index labels are not unique.
all_vectors = [create_vector(text) for text in dataset["text"]]


print("we now have",len(all_vectors),"vector.")
print("these vectors will be our input data to our model.")
print("-"*50)
print("first vector:",all_vectors[0])

# NOW we have created vectors for all the texts and grouped them in "all_vectors"
# NEXT we train a *SUPERVISED* machine learning model on "all_vectors":

#  Supervised means mapping inputs to outputs (X => Y). Our inputs are "all_vectors", and if you look at the "dataset" (the csv file) you see that each line has ("cat","text"), so we can take the category of each line as its label.
# So the mapping will look like this:
# for every ( *vector* in all_vectors && ("*cat*","text") in "dataset"):

#         1.  model_dataset.add(vector,cat)

In [None]:
# The category column provides one label per text, in dataset order.
category_column = dataset["cat"]
labels = category_column.tolist()

# Pair every vector with the label of the same row.
model_dataset = [(vector, label) for vector, label in zip(all_vectors, labels)]

print("one data sample:",model_dataset[0])

# Now we will shuffle "model_dataset" and split it to train and test sets

In [None]:
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
# A fixed seed makes the shuffle, and therefore the train/test split,
# identical on every run.
model_dataset_shuf = shuffle(model_dataset, random_state=42)
# Separate the shuffled (vector, label) pairs back into two parallel tuples.
all_vectors_shuf,labels_shuf = zip(*model_dataset_shuf)

data_length = len(all_vectors)
# The first two thirds are used for training, the remainder for testing.
split_at = int(data_length*2/3)
train_x,train_y = all_vectors_shuf[:split_at],labels_shuf[:split_at]
test_x,test_y = all_vectors_shuf[split_at:],labels_shuf[split_at:]


print("total data size:",data_length)
print("Train data size:",len(train_x))
print("Test data size:",len(test_x))

# we are ready to train!!
# we just need to pick a model, I choose random forest but you can pick any model you want. 
# So let's train.

In [None]:
print ("Training the random forest...")

# Initialize a Random Forest classifier with 100 trees. The fixed
# random_state makes the trained model the same on every run that uses the
# same training data.
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)

# This may take a few minutes to run
forest = forest.fit( train_x, train_y )

print("training is finished.")

# Training is finished. Now let's take a look at the accuracy and see whether the model learned something from the data or not.

In [None]:
from sklearn import metrics

def cal_acc(set_x,set_y):
    """Return the share of samples in `set_x` that `forest` labels correctly.

    Parameters
    ----------
    set_x : sequence of feature vectors handed to ``forest.predict``.
    set_y : sequence of expected labels, one per entry of `set_x`.

    Returns
    -------
    float
        Number of predictions equal to their expected label divided by the
        number of predictions, rounded to two decimals.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of predictions.
    """
    results = forest.predict(set_x)
    # The unused confusion matrix was the only thing rejecting mismatched
    # lengths; keep that guard explicitly so zip() cannot truncate silently.
    if len(set_y) != len(results):
        raise ValueError("set_x and set_y must have the same number of samples")
    matches = sum(1 for expected, predicted in zip(set_y, results) if expected == predicted)
    return round(matches/len(results),2)

# Score the model on the data it was trained on and on the held-out data.
train_accuracy, test_accuracy = cal_acc(train_x,train_y), cal_acc(test_x,test_y)

for caption, score in (("train accurcy:", train_accuracy), ("test accurcy:", test_accuracy)):
    print(caption, score)

# Great!! We achieved good accuracy on both the train and test sets.
# Now here is a text from the internet. It is obviously about "siyaset" (politics), so let's run our model on it.

In [None]:
# A short news snippet used to try the trained classifier on a new text.
# NOTE(review): create_vector compares space-separated tokens verbatim with
# filtered_words, which only holds words accepted by RealWord; tokens below
# that contain capitals or punctuation therefore cannot be counted - confirm
# whether the text should be normalised first.
text = """ak_parti Genel  Başkanı ve Cumhurbaşkanı Recep Tayyip Erdoğan, Hamas Siyasi Büro Başkanı İsmail Heniyye ile telefon görüşmesi gerçekleştirdi.
Erdoğan, Hamas lideri Heniyye iler yaptığı telefon görüşmesinde Heniyye'ye, ağabeyi Halid Heniyye’nin vefatı dolayısıyla başsağlığı diledi"""
# Term-frequency vector of the snippet, built with the same vocabulary.
text_vector = create_vector(text)

In [None]:
# predict is given a list of samples, so the single vector is wrapped in one.
single_sample = [text_vector]
results = forest.predict(single_sample)
print(results)

# The model predicted right!!
# **Congratulations**, you now have a model that predicts the category of a text.

# We are almost done; now we are going to implement one more simple application using the "filtered_words" list.

# We will do extraction-based text summarization: the summary is made of sentences picked from the original text.



In [None]:
# Sample article text; it is split into sentences and summarised below.
text_to_summur = """Bu durum şu şekilde ortaya çıktı: Samsung Galaxy A8 Star’ın Hong Kong sürümünü tanıtmak amacıyla internet sitesinde yer verilen fotoğrafı çeken fotoğrafçı Dunja Djudjic, fotoğrafının EyeEm aracılığıyla Getty üzerinde satıldığına dair bilgilendirildi ve bunun üzerine hemen tersine bir fotoğraf araması yaptığında bu durumu farketti.
Üstelik bu olay, Samsung’un bu tarz bir şeyi ilk kez yapışı değildi. GSM Arena’nın belirttiğine göre, bu yılın başında Samsung Brezilya, Samsung Galaxy A8’i tanıtmak için resmi Twitter hesabından attığı -ve daha sonradan silinen- iki tweet’te stok fotoğraflar kullanmıştı. Ancak görünen o ki, gerçekten söz konusu telefonla çekilmemiş bir fotoğrafın üzerine Samsung filigranı eklemek ve daha sonrasında da bir akıllı telefonun tanıtımını yapmak için bu fotoğrafı kullanmak, Samsung açısından yanıltıcı reklam yapmak anlamına gelmiyor.
Samsung’dan Apple ile Dalga Geçen Reklam Filmleri Serisi Buradaki problem, Samsung’un reklamda kullandığı fotoğrafın tanıtımını yaptığı telefon ile çekilmediğini açıkça belirtmiyor olmasıdır. Tabii, bunu belirtmesinin ne kadar mantıklı olacağı da tartışılır. Zira böyle bir durumda birçok kişi, akıllı telefonun kamerasına vurgu yaptığı bir reklamda stok fotoğraf kullandığı için Samsung’u eleştirecektir.
Samsung’un yanı sıra, Çinli akıllı telefon üreticisi Huawei de geçtimiz aylarda Huawei Nova i3 model akıllı telefonunu tanıtmak için aynı yöntemi uygulamış ve yakalanmıştı. Huawei ise buna karşılık olarak yaptığı açıklamada, “ürün çekimlerinin sadece referans amaçlı olduğunu” tekrarlayarak bu uygulamayı haklı göstermeye çalışmıştı. Bu açıdan bakacak olursak, bu zamana kadar diğer akıllı telefon üreticilerinin de telefon kamerasına vurgu yaptıkları reklamlarda stok fotoğraf kullanmış olma ihtimalleri de bir hayli yüksek."""


# First we need to tokenize the text into sentences; we will do this with the nltk library

In [None]:
from nltk import sent_tokenize
# Split the article into sentences.
# NOTE(review): no language is passed, so NLTK's default sentence model is
# used although the text is Turkish; language="turkish" may split better -
# confirm the resource is installed and that the sentence indices quoted in
# the notebook still hold before switching.
sents = sent_tokenize(text_to_summur)
# The message now names what is being counted.
print("we have", len(sents),"sentences in the text")
print(sents)

# Next we will make a TF vector for each sentence (calculating the frequencies of the words of the sentence)

In [None]:
# Turn every sentence into its term-frequency vector.
sents_vecs = [create_vector(sentence) for sentence in sents]
print(sents_vecs)


# Now let's sum the frequencies of all terms for each vector

In [None]:
# Total number of vocabulary hits per sentence.
sums = [sum(vector) for vector in sents_vecs]

# One "sentN" heading per sentence, printed above the matching totals.
list_to_print = []
for position in range(len(sents)):
    list_to_print.append("sent{}".format(position))
print(list_to_print)
print(sums)


# After summing up the term frequencies of each vector we pick the indices whose sum is not zero, in our case [3,6,8],
# and finally we take the sentences (3,6,8) as the summary of the text

In [None]:
# Positions of the sentences whose vector total is not zero; np.nonzero
# returns one index array per dimension and `sums` has a single dimension.
(idx,) = np.nonzero(sums)
# As an array the sentences can be picked all at once with the index array.
sents = np.array(sents)
chosen_sentences = sents[idx]

print("summarization:")
print("-"*50)
print("\n".join(chosen_sentences))

# **WE ARE DONE !!** That's all.
# I want to thank you for completing this tutorial. I hope you found it useful and well organized, and I wish you a great day.

# Happy coding
