# Getting data for analysis in R

__Hypothesis__:  There will be a correlation between the mass of a protein and its status as "secreted" or "non-secreted".  

To test this hypothesis, we are going to query the UniProt SPARQL endpoint for a particular organism (*Magnaporthe oryzae*) and collect all of the "secreted" proteins, and a sample of the "non-secreted" proteins.  These will be written to a comma-separated file that we will use in the next lesson on the language "R".

The query for this is a bit more complex than any queries we have seen before.  __Why?__  *(there are two reasons!)*

I have written the script for you below.  
<pre>


</pre>


In [9]:

require 'sparql/client'

endpoint = "http://sparql.uniprot.org/sparql"  # what location are we querying?

# Proteins of taxon 318829 (matched through up:organism/rdfs:subClassOf) that
# are classified with a term that is an rdfs:subClassOf GO_0046903; this is the
# criterion this script uses for "secreted".  Returns at most 1000 rows of
# protein URI, sequence mass and sequence string.
query_secreted = <<END
PREFIX up:<http://purl.uniprot.org/core/> 
PREFIX taxon:<http://purl.uniprot.org/taxonomy/> 
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#> 

select distinct ?s ?mass ?sequence
where {
  ?s up:classifiedWith ?go .
  ?s up:classifiedWith/rdfs:subClassOf <http://purl.obolibrary.org/obo/GO_0046903> .
  ?s up:organism/rdfs:subClassOf <http://purl.uniprot.org/taxonomy/318829> .
  
  
  ?s up:sequence ?seq .
  ?seq up:mass ?mass .
  ?seq rdf:value ?sequence
} LIMIT 1000

END

# Same taxon restriction, but keeps only proteins that have at least one
# classification (?go) and for which NO classification is a subclass of
# GO_0046903 (FILTER NOT EXISTS).  Also capped at 1000 rows.
query_not_secreted = <<END2
PREFIX up:<http://purl.uniprot.org/core/> 
PREFIX taxon:<http://purl.uniprot.org/taxonomy/> 
PREFIX rdf:<http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#> 

select distinct ?s ?mass ?sequence
where {
  ?s up:classifiedWith ?go .
  FILTER NOT EXISTS {
      ?s up:classifiedWith/rdfs:subClassOf <http://purl.obolibrary.org/obo/GO_0046903> .
  }
  ?s up:organism/rdfs:subClassOf <http://purl.uniprot.org/taxonomy/318829> .
  
  
  ?s   up:sequence ?seq .
  ?seq up:mass ?mass .
  ?seq rdf:value ?sequence
}  LIMIT 1000

END2

# Destination CSV consumed by the follow-up R lesson.
output_path = "/home/osboxes/UPM_BioinfoCourse/Lectures/files/secretome.csv"

# Runs +query+ against +sparql+ and emits one "mass,length,status" line per
# solution, both to stdout and to +file+.
#
# @param file   [IO] open, writable CSV file handle
# @param sparql [SPARQL::Client] client bound to the endpoint
# @param query  [String] SPARQL SELECT binding ?mass and ?sequence
# @param status [String] label written in the protein_status column
# @return the value of +each+ on the query result
def write_protein_rows(file, sparql, query, status)
  sparql.query(query).each do |solution|
    # length is derived from the sequence string; mass comes straight from the endpoint
    row = "#{solution[:mass].value},#{solution[:sequence].value.length},#{status}"
    puts row
    file.write "#{row}\n"
  end
end

sparql = SPARQL::Client.new(endpoint)  # create a SPARQL client

# Open with "w" (truncate), not "a": the header is written unconditionally, so
# appending would add a second header line and duplicate every row each time
# this cell is re-run, corrupting the CSV.
File.open(output_path, "w") do |file|
  # create the header line
  file.write "mass,length,protein_status\n"

  write_protein_rows(file, sparql, query_secreted, "secreted")
  write_protein_rows(file, sparql, query_not_secreted, "not_secreted")
end


  





69848,630,secreted
103777,943,secreted
20728,187,secreted
21334,193,secreted
163556,1477,secreted
94596,851,secreted
88085,755,secreted
75719,680,secreted
96662,868,secreted
17827,158,secreted
69848,630,secreted
88085,755,secreted
94596,851,secreted
21334,193,secreted
163556,1477,secreted
75719,680,secreted
10854,95,secreted
75719,680,secreted
22733,206,secreted
20728,187,secreted
158577,1433,secreted
88085,755,secreted
94596,851,secreted
6762,60,secreted
79563,704,secreted
21034,191,secreted
70041,632,secreted
21334,193,secreted
104132,946,secreted
61028,547,not_secreted
11291,101,not_secreted
54412,471,not_secreted
26967,236,not_secreted
44920,396,not_secreted
43036,390,not_secreted
46756,433,not_secreted
14300,136,not_secreted
14845,137,not_secreted
44518,400,not_secreted
88461,787,not_secreted
29275,256,not_secreted
6496,56,not_secreted
21514,189,not_secreted
11356,103,not_secreted
2760,25,not_secreted
34591,323,not_secreted
33116,313,not_secreted
81984,734,not_secreted
54922,488,n

268230,2503,not_secreted
79341,735,not_secreted
56217,533,not_secreted
175447,1581,not_secreted
33435,314,not_secreted
37151,328,not_secreted
55618,509,not_secreted
46527,424,not_secreted
155486,1431,not_secreted
55654,516,not_secreted
31998,298,not_secreted
76525,716,not_secreted
30598,282,not_secreted
53521,475,not_secreted
55046,504,not_secreted
53571,488,not_secreted
151683,1421,not_secreted
37620,353,not_secreted
63538,574,not_secreted
59883,525,not_secreted
302158,2684,not_secreted
168527,1513,not_secreted
80148,727,not_secreted
99555,920,not_secreted
50371,462,not_secreted
90792,792,not_secreted
30351,275,not_secreted
64239,579,not_secreted
78074,713,not_secreted
73396,645,not_secreted
49106,428,not_secreted
27422,239,not_secreted
50490,455,not_secreted
79475,712,not_secreted
22338,202,not_secreted
62861,556,not_secreted
18961,177,not_secreted
64935,594,not_secreted
73938,674,not_secreted
48162,432,not_secreted
19136,176,not_secreted
83621,755,not_secreted
126746,1127,not_secret

7743,67,not_secreted
91509,820,not_secreted
29330,274,not_secreted
65054,594,not_secreted
129108,1138,not_secreted
39240,345,not_secreted
24929,220,not_secreted
108072,1000,not_secreted
71821,656,not_secreted
143508,1307,not_secreted
49278,452,not_secreted
33241,298,not_secreted
28719,256,not_secreted
175156,1593,not_secreted
22705,211,not_secreted
90620,812,not_secreted
62749,581,not_secreted
53608,471,not_secreted
57682,563,not_secreted
149179,1337,not_secreted
93130,864,not_secreted
118367,1059,not_secreted
75242,667,not_secreted
55864,511,not_secreted
168050,1529,not_secreted
166585,1533,not_secreted
50888,482,not_secreted
46239,424,not_secreted
84702,765,not_secreted
52679,465,not_secreted
40802,376,not_secreted
98141,897,not_secreted
279107,2556,not_secreted
134856,1225,not_secreted
19354,181,not_secreted
99962,885,not_secreted
173522,1664,not_secreted
151735,1401,not_secreted
18347,166,not_secreted
88949,789,not_secreted
83077,783,not_secreted
97650,888,not_secreted
74928,675,no

[#<RDF::Query::Solution:0x2ac433dbbc18({:sequence=>#<RDF::Literal:0x2ac433c30f74("MASKNMVNPAVEPSMEDDLFAREVAEVKQWWSDPRWRYTKRPFTAEQIVSKRGNLKIEYPSNAQSKKLWKILEGRFQKRDASYTYGCLEPTMVTQMAKYLDTVYVSGWQSSSTASSSDEPGPDLADYPYTTVPNKVSHLFMAQLFHDRKQRHERLSAPKSERSKLQNIDYLRPIIADADTGHGGLTAVMKLTKLFIEKGAAGIHIEDQAPGTKKCGHMAGKVLVPINEHINRLVAIRAQADIMGVDLLAIARTDAEAATLITTSIDPRDHAFILGCTNPSLQPLADLMNTAEQSGKTGDQLQAIEDEWMAKANLKRFDDAVVDVINSSSSIRNPKDVAAKYLQAAKGKSNREARAIASSLGVPEIFFDWDSPRTREGYFRIKGGCDCAINRAIAYAPYADAIWMESKLPDYEQAKEFAEGVHAVYPEQKLAYNLSPSFNWKTAMPRDEQETYIRRLAGLGYCWQFITLAGLHTTALISDRFARAYSEVGMRAYGELVQEPEMELGVDVVKHQKWSGATYVDELQKMVTGGVSSTAAMGKGVTEDQFH")>, :s=>#<RDF::URI:0x2ac433c3072c URI:http://purl.uniprot.org/uniprot/L7HUY5>, :mass=>#<RDF::Literal:0x2ac4339efcec("61028"^^<http://www.w3.org/2001/XMLSchema#int>)>})>, #<RDF::Query::Solution:0x2ac433dbb1c8({:sequence=>#<RDF::Literal:0x2ac433dbbb00("MSAFGGPGGGQINQKPIPPQRGSFPLDHDGECKHVMTSYLACMKKVRGVNDNECREFAKSYLACRMDHNLMARDEFKNLGFQDVKDSSNKGDPAKKGELRW")>, :s=>#<RDF::UR