In [None]:
import requests
from bs4 import BeautifulSoup

def fetch_discussion_page(url, timeout=30):
    """Download one discussion thread page and extract its text content.

    Parameters
    ----------
    url : str
        Address of the discussion thread to fetch.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` aborts the call.
        Without a timeout a stalled connection would block forever.

    Returns
    -------
    tuple
        ``(title, desc, reply)``. ``title`` comes from the element with id
        ``content-post-title`` (falling back to the HTML ``<title>``),
        ``desc`` is the text of the first ``content-post-body-content``
        element, and ``reply`` is a tuple with the distinct texts of the
        remaining ones, in page order.
        ``(None, None, None)`` is returned when the response status is not 200.

    Raises
    ------
    ValueError
        If the page has no ``content-post-body-content`` element at all.
    requests.exceptions.RequestException
        Network failures and timeouts are propagated to the caller.
    """
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        print("Error fetching the discussion_page")
        return None, None, None

    soup = BeautifulSoup(page.content, "html.parser")

    # Prefer the dedicated title element; fall back to the document <title>.
    title_tag = soup.find(attrs={"id": "content-post-title"})
    if title_tag is not None:
        title = title_tag.text.strip()
    else:
        title = soup.title.text.strip()

    posts = soup.find_all(attrs={"class": "content-post-body-content"})
    if not posts:
        # Same exception type the bare tuple-unpacking used to raise, but
        # with a message that says which page was the problem.
        raise ValueError(f"no post bodies found in {url}")

    # The first body is treated as the description, the rest as replies.
    first_post, *reply_posts = posts
    desc = first_post.text.strip()

    # dict.fromkeys removes duplicate replies while keeping page order,
    # so repeated runs produce the same tuple (a set would not).
    reply = tuple(dict.fromkeys(post.text.strip() for post in reply_posts))

    return title, desc, reply

In [None]:
# Smoke-test the scraper on a single thread; the returned tuple is the cell output.
fetch_discussion_page(url = "https://discussions.apple.com/thread/253910868")

('Apple keeps losing my trade ins…',
 'For the 2nd time, for the second iPhone trade in, I’ve received an email from apple stating my trade in was canceled and I will be charged… I’ve spent WAY TOO much time on these issues talking to customer service on the phone, via text and chat, and they still can’t figure this out. Provided BOTH times, photos of tracking numbers, the fed ex box drop off location and the fed ex tracking receipt showing THEY received them. Why does this keep happening and how do I resolve this?',
 ('Never had an issue, but contact Apple Support.',))

In [None]:
import pickle

# Unique (title, desc, reply) tuples, one per successfully scraped thread.
discussion_set = set()
# (link, reason) pairs for every link that could not be scraped, so failures
# are visible instead of being silently discarded.
failed_links = []

with open("discussion.links", "r") as links_file:
    for link in links_file:
        link = link.strip()
        if not link:
            # Skip blank lines instead of issuing a request for an empty URL.
            continue
        try:
            title, desc, reply = fetch_discussion_page(link)
        except Exception as exc:
            # Best-effort crawl: record the failure and move on.
            # KeyboardInterrupt is deliberately not caught, so the crawl can
            # still be stopped by hand.
            failed_links.append((link, repr(exc)))
            continue
        if title is None:
            # The fetcher signals a non-200 response with (None, None, None);
            # do not store that placeholder as if it were a thread.
            failed_links.append((link, "non-200 response"))
            continue
        discussion_set.add((title, desc, reply))

print(len(discussion_set))
if failed_links:
    print(f"{len(failed_links)} link(s) could not be scraped; see failed_links")

3292


In [None]:
# Persist the scraped set to disk. The context manager guarantees the file is
# flushed and closed, which the bare open() call did not.
with open("discussion_set.pkl", "wb") as pickle_file:
    pickle.dump(discussion_set, pickle_file)

In [None]:
import pandas as pd

# One row per scraped thread; the reply column holds the whole tuple of replies.
df = pd.DataFrame(
    data=discussion_set,
    columns=["title", "desc", "reply"],
)
df.head(n=5)

Unnamed: 0,title,desc,reply
0,iphone tradein & non-functioning FaceID,I want to trade my iPhone X and upgrade to iPh...,(Thanks for the reply. This isn't an OS update...
1,Iphone Trade-In Help,Hi all and hope someone can help? I just bough...,"(Hello,This link should answer your questions:..."
2,OS X Yosemite - resetting my MacBook to trade in,I am resetting my MacBook to its factory setti...,(You need to erase the whole physical SSD (Int...
3,How to change iPhone model of a scheduled trad...,I’ve purchased a new pine 13 online and provid...,"(Start here.Contact - Official Apple Support, ..."
4,iPhone trade-in,I’m about to buy an iPhone XS and i liked the ...,(When I went to the site and looked up yours T...


In [None]:
# Export the per-thread table; index=False keeps the row index out of the file.
df.to_csv("discussion_set.csv", index=False)

In [None]:
# Frequency of each distinct title (exact string match, so case variants are counted separately).
df["title"].value_counts()

Unnamed: 0_level_0,count
title,Unnamed: 1_level_1
Trade in,204
iPhone trade in,87
trade in,67
Trade In,22
iphone trade in,22
...,...
"I am trading in my phone, should I erase my phone after backup?",1
Can I trade up my ipad (MD785LL/B)? How do I retain/transfer my files?,1
"Experienced Apple Trade-In User, Received Returned iPad 'Unavailable' Locked",1
Will Apple Store take a “locked to owner” device as trade in?,1


In [None]:
# Shortest and longest description, measured with len() on each value.
desc_lengths = df["desc"].map(len)
min(desc_lengths), max(desc_lengths)

(0, 54899)

In [None]:
# Smallest and largest number of replies stored for a single thread.
reply_counts = df["reply"].map(len)
min(reply_counts), max(reply_counts)

(0, 15)

In [None]:
# How many threads have each reply count.
df["reply"].map(len).value_counts()

Unnamed: 0_level_0,count
reply,Unnamed: 1_level_1
1,1971
2,596
3,295
4,129
0,110
5,63
6,34
7,19
15,18
8,16


In [None]:
# Null check for the whole frame first, then for each column on its own.
(
    df.isna().value_counts(),
    *(df[column].isna().value_counts() for column in ("desc", "reply", "title")),
)

(title  desc   reply
 False  False  False    3292
 Name: count, dtype: int64,
 desc
 False    3292
 Name: count, dtype: int64,
 reply
 False    3292
 Name: count, dtype: int64,
 title
 False    3292
 Name: count, dtype: int64)

# Preprocessing
- Input data: the scraped discussions saved in the `discussion_set.pkl` pickle file.

In [12]:
import pickle

# Accumulator for the flattened (title, desc, reply) records.
training_list = []

# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
# The context manager closes the file handle, which the bare open() call leaked.
with open("discussion_set.pkl", "rb") as pickle_file:
    discussion_set = list(pickle.load(pickle_file))

In [13]:
# Flatten each thread into one record per reply. The list is rebuilt from
# scratch here, so re-running this cell cannot append the same records twice.
# Threads whose reply tuple is empty contribute no records.
training_list = [
    {"title": title, "desc": desc, "reply": reply}
    for title, desc, replies in discussion_set
    for reply in replies
]

print(len(training_list))

6045


In [14]:
# Peek at the first five flattened records.
training_list[:5]

[{'title': 'Apple trade in box',
  'desc': 'Hello, the parcel for the trade in box got damaged, can I request a new one please.',
  'reply': 'This is a community forum where Apple users come together to share tips on Apple devices and troubleshoot issues within the boundaries of the Apple ecosystem**.   Here, you can:Dive into meaningful discussions:\xa0Get insights, share your experiences, and learn from fellow Apple enthusiasts about Apple productsUnlock the potential of your devices:\xa0Discover hidden features, troubleshoot problems, and get the most out of your iPhone, Mac, or any other Apple product.Tap into a wealth of knowledge:\xa0Our community is full of Apple experts who are happy to share their wisdom and help you navigate the Apple ecosystem**.Apple moderates the forum to ensure a friendly and productive environment, but the real magic happens through user-to-user interaction. So, feel free to explore existing discussions, ask questions, and contribute your own expertise o

In [15]:
import pandas as pd

# Tabulate the flattened records; columns= pins the column order to title, desc, reply.
training_data = pd.DataFrame(training_list, columns=["title", "desc", "reply"])

In [16]:
# Column/dtype summary, followed by the number of nulls in each column.
training_data.info()
print("\nMissing values:", training_data.isna().sum(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6045 entries, 0 to 6044
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   6045 non-null   object
 1   desc    6045 non-null   object
 2   reply   6045 non-null   object
dtypes: object(3)
memory usage: 141.8+ KB

Missing values:
title    0
desc     0
reply    0
dtype: int64


In [17]:
import re
import unicodedata

# Remove rows with missing values, then exact duplicate rows of the raw text.
training_data = training_data.dropna().drop_duplicates()


# Typographic punctuation that NFKD cannot decompose. It is mapped by hand so
# the ASCII encoding step does not silently delete it ("I’ve" -> "I've"
# instead of "Ive").
_ASCII_PUNCTUATION = str.maketrans({
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark / apostrophe
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
})


def clean_text(text):
    """Normalise a text value to single-spaced, trimmed ASCII.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` is returned unchanged.

    Returns
    -------
    object
        For strings: the text with curly quotes and dashes replaced by their
        ASCII counterparts, NFKD-normalised, stripped of any remaining
        non-ASCII characters, with every whitespace run collapsed to one
        space and no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return text

    text = text.translate(_ASCII_PUNCTUATION)

    # Normalize unicode characters, then drop whatever has no ASCII form.
    text = unicodedata.normalize("NFKD", text)
    text = text.encode("ascii", "ignore").decode("ascii")

    # Whitespace is collapsed last: removing characters above can leave
    # doubled or leading/trailing spaces behind.
    return re.sub(r"\s+", " ", text).strip()

# Clean every text column that is present in the frame.
for col in ['title', 'desc', 'reply']:
    if col in training_data.columns:
        training_data[col] = training_data[col].apply(clean_text)

# Cleaning can turn rows that differed only in whitespace or non-ASCII
# characters into exact duplicates, so de-duplicate again after normalising
# and renumber the index so it has no gaps.
training_data = training_data.drop_duplicates().reset_index(drop=True)

training_data.head()

Unnamed: 0,title,desc,reply
0,Apple trade in box,"Hello, the parcel for the trade in box got dam...",This is a community forum where Apple users co...
1,Trade in on my iPhone,I just wanted to know if u trade in ur phone d...,No. See the following from Apple Trade In - Ap...
2,iPhone Trade In,If I wanted to get the iPhone 11 and I selecte...,"Hi mindyask, Thank you for posting in the Appl..."
3,iPhone Repair Trade Ins,If my card is put on hold for a new phone to b...,"Hello Bbennett96,Welcome to Apple Support Comm..."
4,IPHONE TRADE IN,"Hi, I just upgraded 2 Iphones, I would like to...",It isn't required but it will go faster if you...


In [18]:
# Save the cleaned title/desc/reply rows; index=False leaves the row index out of the file.
training_data.to_csv("training_data.csv", index=False)