In [None]:
# Mount Google Drive at /content/drive so later cells can work with files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch to the project folder on the mounted Drive; later cells write their output files using relative paths.
%cd /content/drive/MyDrive/Colab_Notebooks/distrowatch NLP

/content/drive/MyDrive/Colab_Notebooks/distrowatch NLP


In [None]:
import requests
from pickle import dump
import pandas as pd
try:
  import scrapy
except:
  ModuleNotFoundError
  !pip install scrapy --quiet
  import scrapy


The process of information extraction will be exemplified with only one Linux distribution.

In [None]:
# Walk through the extraction with a single distribution before scaling up.
distro = 'ubuntu'
url_rating = f'https://distrowatch.com/dwres.php?resource=ratings&distro={distro}'

# Always pass a timeout: without one, requests can wait indefinitely on a
# stalled server. The tuple is (connect timeout, read timeout) in seconds.
response = requests.get(url_rating, timeout=(60, 60))
# Fail loudly on an HTTP error instead of parsing an error page as if it held reviews.
response.raise_for_status()
sel = scrapy.Selector(text=response.text)

In [None]:
# Every field is located inside the td whose class is "News1": the first nested
# table is queried for the overall figures, the second for the individual reviews.
summary_cell = '//td[@class = "News1"]//table[1]//td[2]'
review_meta = '//td[@class = "News1"]//table[2]//tr/td[1]'

total_reviews = sel.xpath(f'{summary_cell}/b[2]/text()').extract()
avg_rating = sel.xpath(f'{summary_cell}/div/text()').extract()

# Text nodes 3 to 7 of the first column are read, in this order, as
# project, version, rating, review date and votes.
project, version, rating, date_review, votes = (
    sel.xpath(f'{review_meta}/text()[{node}]').extract() for node in range(3, 8)
)

# The review body sits in the second column of the same rows.
reviews = sel.xpath('//td[@class = "News1"]//table[2]//tr/td[2]/text()').extract()

In [None]:
# Peek at the first ten extracted text fragments.
reviews[:10]

['\nUbuntu was the first distribution I used on Linux. Then I was a distrohopper for 4 years and learned everything about Debian/Ubuntu, Arch, Gentoo, Opensuse and other alternatives like Void, Slakware, etc.\r',
 "I always come back to Ubuntu. I don't like snaps or the continuous error in the snap-store, but I feel that it is the most stable, modern and efficient distro.\r",
 "When I want to learn about Linux, I use other distributions, but when I want to work with Linux, I always go back to Ubuntu. I use flatpak because I like it better, nobody prevents it even if you don't have flatpak pre-installed.\r",
 "Oh! and Ubuntu is the only distro I've installed on all types of hardware (old imac, new hardware, old hardware) and it always works. It just all works.\r",
 '\r',
 '\n',
 '\nI have used Ubuntu Linux for years, and I have had good experiences with it most time.\r',
 '\r',
 "Firstly, it's user friendly. For someone switching from Windows to Linux, Ubuntu is a good start, not with a

Different paragraphs in the same review are separated by `\r`, but different reviews are separated by `\n`. Also, there is another `\n` character at the beginning of the next review. In order to split the reviews correctly we can first "glue" all reviews and then use the pattern `'\n\n'` to separate reviews.

In [None]:
# Glue the text fragments back into one string, then cut it at every
# double newline so that each element holds one complete review.
glued_text = ''.join(reviews)
reviews = glued_text.split('\n\n')
# Sanity check: there must be exactly one review text per rating.
assert len(rating) == len(reviews)

In [None]:
# Assemble the scraped lists into one table, one column per field.
distro_columns = {
    'date': date_review,
    'project': project,
    'version': version,
    'rating': rating,
    'votes': votes,
    'review': reviews,
}
df_distro = pd.DataFrame(distro_columns)
print(df_distro.shape)
df_distro.head()

(50, 6)


Unnamed: 0,date,project,version,rating,votes,review
0,2023-07-24,Ubuntu,22.04,9,1,\nUbuntu was the first distribution I used on ...
1,2023-07-09,Ubuntu,22.04,9,1,"I have used Ubuntu Linux for years, and I have..."
2,2023-07-03,Ubuntu,22.1,8,0,I like most of the Debian based system in the ...
3,2023-06-18,Ubuntu,22.04,8,10,(Read this if you're inexperienced with Linux)...
4,2023-06-17,Ubuntu,22.04,5,10,The DistroWatch review of 22.04 is completely ...


In [None]:
# Show the complete, untruncated text of the review in the row labelled 0.
first_review = df_distro.loc[0, 'review']
first_review

"\nUbuntu was the first distribution I used on Linux. Then I was a distrohopper for 4 years and learned everything about Debian/Ubuntu, Arch, Gentoo, Opensuse and other alternatives like Void, Slakware, etc.\rI always come back to Ubuntu. I don't like snaps or the continuous error in the snap-store, but I feel that it is the most stable, modern and efficient distro.\rWhen I want to learn about Linux, I use other distributions, but when I want to work with Linux, I always go back to Ubuntu. I use flatpak because I like it better, nobody prevents it even if you don't have flatpak pre-installed.\rOh! and Ubuntu is the only distro I've installed on all types of hardware (old imac, new hardware, old hardware) and it always works. It just all works.\r\r"

The problem with this approach is that we only get 50 reviews. This is probably because some of the content on the page is generated dynamically. Only the first 50 reviews are present in the static HTML source code, and I guess the rest are loaded as you scroll. It would be necessary to use `selenium` to get the full content, but for this project I'll keep things simple.

Now with more distros

In [None]:
# Collect the distro identifiers linked from the popularity ranking page.
url_popularity = 'https://distrowatch.com/dwres.php?resource=popularity'
# (connect, read) timeout in seconds, so a stalled server cannot hang the notebook.
popularity_page = requests.get(url_popularity, timeout=(60, 60))
# Stop here on an HTTP error rather than extracting names from an error page.
popularity_page.raise_for_status()
sel = scrapy.Selector(text=popularity_page.text)

# set() removes repeated links; sorted() makes the order reproducible, since the
# iteration order of a set of strings can change from one session to the next.
distros = sorted(set(sel.xpath('//td[@class = "phr2"]/a/@href').extract()))

# Persist the list so it can be reloaded without hitting the site again.
with open("distro_names", "wb") as f:
  dump(distros, f)
distros[:10]

['boss',
 'mabox',
 'murena',
 'baruwa',
 'kaos',
 'lfs',
 'guixsd',
 'runtu',
 'peppermint',
 'rds']

In [None]:
# Scrape the ratings page of every entry in `distros` and collect one
# DataFrame per distro in `dfs`.

# (connect, read) timeouts in seconds; constant, so defined once outside the loop.
conn_timeout = 60
read_timeout = 60

dfs = []
for distro in distros:
  url_rating = f'https://distrowatch.com/dwres.php?resource=ratings&distro={distro}'
  try:
    response = requests.get(url_rating, timeout=(conn_timeout, read_timeout))
    response.raise_for_status()
  except requests.RequestException as err:
    # One unreachable page should not throw away everything scraped so far.
    print(f'{distro}:request failed ({err})')
    continue
  sel = scrapy.Selector(text=response.text)

  project = sel.xpath('//td[@class = "News1"]//table[2]//tr/td[1]/text()[3]').extract()
  version = sel.xpath('//td[@class = "News1"]//table[2]//tr/td[1]/text()[4]').extract()
  rating = sel.xpath('//td[@class = "News1"]//table[2]//tr/td[1]/text()[5]').extract()
  date_review = sel.xpath('//td[@class = "News1"]//table[2]//tr/td[1]/text()[6]').extract()
  votes = sel.xpath('//td[@class = "News1"]//table[2]//tr/td[1]/text()[7]').extract()
  reviews = sel.xpath('//td[@class = "News1"]//table[2]//tr/td[2]/text()').extract()

  if not reviews:
    print(distro + ':no reviews')
    continue

  # Re-join the text fragments and split on the blank line between reviews.
  reviews = ''.join(reviews).split('\n\n')
  fields = {'date':date_review,
            'project':project,
            'version':version,
            'rating':rating,
            'votes':votes,
            'review':reviews}

  # pd.DataFrame raises if the columns differ in length, which would abort the
  # whole loop; report the distro and move on instead.
  if len({len(values) for values in fields.values()}) > 1:
    print(distro + ':skipped, field lengths differ')
    continue

  df_distro = pd.DataFrame(fields)
  dfs.append(df_distro)

ubos:no reviews
rockstor:no reviews
max:no reviews
endian:no reviews
karoshi:no reviews
pld:no reviews
freedombox:no reviews
nst:no reviews
selks:no reviews
kolibri:no reviews
miracle:no reviews
omoikane:no reviews
zevenet:no reviews
rds:no reviews
volumio:no reviews
vine:no reviews
bicom:no reviews
ovios:no reviews
runtu:no reviews
nova:no reviews
canaima:no reviews
thinstation:no reviews
rss:no reviews
turnkey:no reviews
ob2d:no reviews
edubuntu:no reviews
primtux:no reviews
elearnix:no reviews
photonos:no reviews
recalbox:no reviews
omarine:no reviews
plop:no reviews
blueonyx:no reviews
baruwa:no reviews
bsdrp:no reviews
plamo:no reviews
univention:no reviews
pentoo:no reviews


In [None]:
# Stack the per-distro frames into a single table with a fresh 0..n-1 index,
# converting the scraped strings to datetime / numeric dtypes along the way.
df = (
    pd.concat(dfs, ignore_index=True)
    .assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        votes=lambda frame: pd.to_numeric(frame['votes']),
        rating=lambda frame: pd.to_numeric(frame['rating']),
    )
)

# Snapshot of the scraped data (4th August 2023).
df.to_csv('distrowatch.csv')

df.head()

Unnamed: 0,date,project,version,rating,votes,review
0,2022-03-05,HardenedBSD,13-build-322,9,9,\nThe HardenedBSD project has corrected a lot ...
1,2023-07-12,OpenMandriva Lx,23.03,9,2,\nThis distro is very sadly under rated. I hav...
2,2023-07-07,OpenMandriva Lx,23.03,10,5,I to use Mandriva back in the old days but...
3,2023-04-07,OpenMandriva Lx,23.03,8,8,First I have to say I did not have any issues ...
4,2023-03-08,OpenMandriva Lx,23.01,10,8,"I had to choose the version as 23.01, but the ..."


In [None]:
# Summary statistics of the rating for each project, most-reviewed projects first.
rating_stats = df.groupby('project')['rating'].describe()
rating_stats.sort_values(by='count', ascending=False)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
project,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Manjaro Linux,50.0,8.16,3.099440,1.0,7.25,10.0,10.0,10.0
Ubuntu MATE,50.0,8.28,2.491127,1.0,8.00,9.0,10.0,10.0
TrueNAS,50.0,9.06,1.621413,1.0,9.00,9.5,10.0,10.0
Debian,50.0,8.54,2.476090,1.0,8.00,10.0,10.0,10.0
Linuxfx,50.0,6.10,3.339620,1.0,3.00,6.0,9.0,10.0
...,...,...,...,...,...,...,...,...
Ubuntu Christian Edition,1.0,10.00,,10.0,10.00,10.0,10.0,10.0
ClearOS,1.0,8.00,,8.0,8.00,8.0,8.0,8.0
Untangle NG Firewall,1.0,4.00,,4.0,4.00,4.0,4.0,4.0
Uruk GNU/Linux,1.0,10.00,,10.0,10.00,10.0,10.0,10.0
