In [2]:
from bs4 import BeautifulSoup
import requests 
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from farasa.stemmer import FarasaStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from app import TextClassification

> __Scraping Final Functions__

In [26]:
def _parse_listing_card(card, base_url):
    """Extract the listing fields of one ``bigOneSec`` card as a dict.

    Every field is read before anything is returned, so a card that lacks any
    element is rejected as a whole instead of contributing a partial row.

    Raises:
        AttributeError: an expected child tag (img / span / p / a) is absent.
        KeyError: an expected tag attribute (alt / src / href) is absent.
    """
    image = card.img
    return {
        "titles": image.attrs["alt"].strip(),
        "dates": card.span.text.strip(),
        "brief_articles": card.p.text.strip(),
        "links": base_url + card.a.attrs["href"].strip(),
        "imgs": image.attrs["src"],
    }


def _parse_article(soup):
    """Return ``(creator, key_words)`` strings read from a parsed article page.

    Either value is an empty string when the corresponding element
    (``span.writeBy`` / ``div.tags``) is not present on the page.
    """
    byline_prefixes = ("كتبت", "كتب", "إعداد")

    byline = soup.find("span", {"class": "writeBy"})
    creator = byline.text.strip().split() if byline is not None else []
    # Strip only *leading* byline words; a prefix word appearing later in the
    # byline must not cause the first token to be discarded.
    while creator and creator[0] in byline_prefixes:
        creator = creator[1:]

    keys = []
    tags = soup.find("div", {"class": "tags"})
    if tags is not None:
        for heading in tags.find_all("h3"):
            if heading.a is not None:
                keys.append(heading.a.text.strip())

    return " ".join(creator), " ".join(keys)


def scrap(max_num = 10, timeout = 30):
    """Scrape youm7.com section 298 listing pages and their article pages.

    Args:
        max_num: number of listing pages to fetch (pages 1..max_num).
        timeout: seconds passed to every ``requests.get`` call so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        pandas.DataFrame with the columns ``titles``, ``dates``,
        ``brief_articles``, ``links``, ``imgs``, ``key_words``, ``creators``
        and ``text`` (title + creator + key words + brief, space-joined),
        with fully duplicated rows removed.
    """
    y7_link = "https://www.youm7.com"
    listing_columns = ["titles", "dates", "brief_articles", "links", "imgs"]

    rows = []
    pnum = 1
    while pnum <= max_num:
        result = requests.get(f"https://www.youm7.com/Section/%D8%A3%D8%AE%D8%A8%D8%A7%D8%B1-%D8%A7%D9%84%D8%B1%D9%8A%D8%A7%D8%B6%D8%A9/298/{pnum}", timeout=timeout)
        soup = BeautifulSoup(result.content, "lxml")

        # Query the page once instead of once per field per card.
        cards = soup.find_all("div", {"class" : "col-xs-12 bigOneSec"})
        for i, card in enumerate(cards):
            try:
                rows.append(_parse_listing_card(card, y7_link))
            except (AttributeError, KeyError):
                # Malformed card: skip it entirely so columns stay aligned.
                print(i, 'are passed')

        pnum += 1

    # Passing `columns` keeps the schema stable even when nothing was scraped.
    df = pd.DataFrame(rows, columns=listing_columns)

    # The same link can appear on several listing pages; fetch each article once.
    article_cache = {}
    creators = []
    key_words = []
    for link in df["links"]:
        if link not in article_cache:
            result = requests.get(link, timeout=timeout)
            article_cache[link] = _parse_article(BeautifulSoup(result.content, "lxml"))
        creator, keys = article_cache[link]
        creators.append(creator)
        key_words.append(keys)

    df["key_words"] = key_words
    df["creators"] = creators
    df["text"] = df["titles"] + " " + df["creators"] + " " + df["key_words"] + " " + df["brief_articles"]

    df.drop_duplicates(inplace = True)
    return df

In [27]:
# Scrape only the first listing page of the section.
df = scrap(max_num=1)

In [28]:
# Persist the scraped frame. The index is written too, which is why the file
# has an 'Unnamed: 0' column when it is read back with pd.read_csv.
df.to_csv('y7.csv', index=True)

In [5]:
class TextClassification:
    """Find similar articles via nearest neighbours on bag-of-words counts.

    The ``text`` column of ``df`` is tokenised, stripped of Arabic stop words
    and one-character tokens, stemmed with Farasa and vectorised with
    ``CountVectorizer``; ``NearestNeighbors`` is then fitted on the counts and
    the neighbour indices of every row are stored as a table of links.

    NOTE(review): this definition shadows the ``TextClassification`` imported
    from ``app`` in the notebook's import cell — confirm which one is intended.

    Args:
        df: DataFrame with at least the columns ``text`` and ``links``. A
            ``clean_text`` column is added to it in place. When ``None``,
            construction succeeds but nothing is fitted until ``df`` is
            assigned and ``apply_clean()`` / ``train()`` are called.
    """

    def __init__(self, df = None):
        self.df = df
        self.tokenizer = RegexpTokenizer(r'\w+')  # return alphanumerics only
        self.fst = FarasaStemmer(interactive = True)  # farasa stemmer
        self.cv = CountVectorizer()
        self.nn = NearestNeighbors()
        # Load the stop-word list once: stopwords.words() re-reads the corpus
        # on every call and a set gives O(1) membership tests. NLTK's fileid
        # is lowercase; 'Arabic' only resolves on case-insensitive filesystems.
        self.stop_words = frozenset(stopwords.words('arabic'))
        self.urls = None  # neighbour-link table, filled by train()

        if self.df is not None:
            self.apply_clean()
            self.train()

    def clean(self, text):
        """Return ``text`` as space-joined stems without stop words.

        Tokens that are stop words or exactly one character long are dropped
        before stemming.
        """
        cleaned = [
            self.fst.stem(w)
            for w in self.tokenizer.tokenize(text)
            if len(w) != 1 and w not in self.stop_words
        ]
        return ' '.join(cleaned)

    def apply_clean(self):
        """Add a ``clean_text`` column to ``self.df`` by cleaning ``text``."""
        self.df['clean_text'] = self.df['text'].apply(self.clean)

    def train(self):
        """Fit the vectoriser and neighbour index; build the link table.

        After this call ``self.urls`` is a DataFrame in which row ``i`` holds
        the links of the neighbours returned for article ``i``.
        """
        x = self.cv.fit_transform(self.df['clean_text']).toarray()
        self.nn.fit(x)
        distances, indices = self.nn.kneighbors(x)
        # Map positional neighbour indices to links in one vectorised step
        # (replaces the deprecated per-cell DataFrame.applymap + iloc lookup).
        links = self.df['links'].to_numpy()
        self.urls = pd.DataFrame(links[indices])

    def predict(self, url):
        """Return the array of neighbour links for the article at ``url``.

        Raises:
            RuntimeError: the model has not been trained yet.
            IndexError: ``url`` is not the first-column link of any row.
        """
        if self.urls is None:
            raise RuntimeError("model is not trained; call train() first")
        matches = self.urls[self.urls[0] == url]
        if matches.empty:
            # Same exception type as before, but with an explanatory message.
            raise IndexError(f"URL not found among the fitted articles: {url!r}")
        return matches.values[0]

In [6]:
# Load the scraped articles back from the CSV file written by df.to_csv above.
df = pd.read_csv('y7.csv')

In [7]:
# Remove the index column that to_csv wrote out. errors='ignore' keeps the cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [8]:
# Build the model: the constructor cleans df['text'] (adding a 'clean_text'
# column to df) and fits the nearest-neighbour index.
tc = TextClassification(df)



In [9]:
# Look up the neighbour links of one article by its URL; the URL must match a
# value in df.links exactly.
tc.predict('https://www.youm7.com/story/2022/4/28/الجهاز-الفنى-للمقاولون-يتابع-لقاء-المصرى-والزمالك-استعداداً-لموقعة-الأحد/5743171')

array(['https://www.youm7.com/story/2022/4/28/الجهاز-الفنى-للمقاولون-يتابع-لقاء-المصرى-والزمالك-استعداداً-لموقعة-الأحد/5743171',
       'https://www.youm7.com/story/2022/4/28/مدرب-سيراميكا-جاهزون-للفوز-على-الأهلى-فى-المباراة-القادمة/5743645',
       'https://www.youm7.com/story/2022/4/28/أهداف-مباراة-بيراميدز-وسيراميكا-بالدورى/5743600',
       'https://www.youm7.com/story/2022/4/28/هيثم-شعبان-رحيلى-عن-سيراميكا-شائعات-والفوز-على-بيراميدز-مهم/5743606',
       'https://www.youm7.com/story/2022/4/28/سيراميكا-يحقق-أول-فوز-فى-2022-بعد-غياب-13-مباراة/5743596'],
      dtype=object)

In [12]:
# Show the row(s) of df whose link equals the URL queried above.
df[df.links == 'https://www.youm7.com/story/2022/4/28/الجهاز-الفنى-للمقاولون-يتابع-لقاء-المصرى-والزمالك-استعداداً-لموقعة-الأحد/5743171']

Unnamed: 0,titles,dates,brief_articles,links,imgs,key_words,creators,text,clean_text
0,الجهاز الفنى للمقاولون يتابع لقاء المصرى والزم...,الخميس، 28 أبريل 2022 04:00 ص,أغلق الجهاز الفني لفريق المقاولون العرب، صفحة ...,https://www.youm7.com/story/2022/4/28/الجهاز-ا...,https://img.youm7.com/medium/20220117121948194...,محمد عودة المقاولون اخبار المقاولون المقاولون ...,أسماء عمر,الجهاز الفنى للمقاولون يتابع لقاء المصرى والزم...,جهاز فنى مقاولون تابع لقاء مصرى زمالك استعداد ...


In [3]:
# Re-read the CSV, rebinding `df` (this copy still has the 'Unnamed: 0' column).
# NOTE(review): this cell's execution count (In [3]) is lower than the cells
# above it, so the notebook was run out of order — confirm it survives
# Restart & Run All.
df = pd.read_csv("y7.csv")

In [4]:
# Display the 'links' column of the reloaded frame.
df.links

0     https://www.youm7.com/story/2022/4/30/الاتحاد-...
1     https://www.youm7.com/story/2022/4/30/7-أخبار-...
2     https://www.youm7.com/story/2022/4/30/نوستالجي...
3     https://www.youm7.com/story/2022/4/30/هيثم-شعب...
4     https://www.youm7.com/story/2022/4/30/موعد-مبا...
5     https://www.youm7.com/story/2022/4/30/المصرى-ي...
6     https://www.youm7.com/story/2022/4/30/كل-ما-تر...
7     https://www.youm7.com/story/2022/4/30/مدرب-الإ...
8     https://www.youm7.com/story/2022/4/30/رياضيون-...
9     https://www.youm7.com/story/2022/4/30/أسباب-اس...
10    https://www.youm7.com/story/2022/4/30/سيراميكا...
11    https://www.youm7.com/story/2022/4/30/التشكيل-...
12    https://www.youm7.com/story/2022/4/30/بره-المل...
13    https://www.youm7.com/story/2022/4/30/الإسماعي...
14    https://www.youm7.com/story/2022/4/30/بيراميدز...
15    https://www.youm7.com/story/2022/4/30/بالأرقام...
16    https://www.youm7.com/story/2022/4/30/ماذا-قدم...
17    https://www.youm7.com/story/2022/4/30/اعرف

In [6]:
# Show the link stored under index label 0.
df.links[0]

'https://www.youm7.com/story/2022/4/30/الاتحاد-السكندرى-يبحث-عن-أول-فوز-فى-2022-أمام-المحلة/5746304'

In [9]:
# `in` / `not in` on a Series tests the index labels, not the stored values,
# so membership has to be checked against `.values`.
'https://www.youm7.com/story/2022/4/30/الاتحاد-السكندرى-يبحث-عن-أول-فوز-فى-2022-أمام-المحلة/5746304' not in df.links.values

False