From 21782a39945df2262347673d4768c61a9e68e844 Mon Sep 17 00:00:00 2001 From: DavyWk Date: Fri, 28 Nov 2014 22:03:11 -0500 Subject: [PATCH] Added MIME parsing --- MailParser.cs | 242 ++++---------------------------------------------- MimeParser.cs | 158 ++++++++++++++++++++++++++++++++ POP3Client.cs | 7 +- POPLib.csproj | 6 +- POPMessage.cs | 1 - 5 files changed, 182 insertions(+), 232 deletions(-) create mode 100644 MimeParser.cs diff --git a/MailParser.cs b/MailParser.cs index faa6a8e..2895514 100644 --- a/MailParser.cs +++ b/MailParser.cs @@ -1,6 +1,7 @@ using System; -using System.Globalization; +using System.Linq; using System.Text; +using System.Globalization; using System.Collections.Generic; using Utils; @@ -13,15 +14,15 @@ internal class MailParser { public POPMessage Message { get; private set; } - private List lines; + private readonly List lines; public MailParser(List messageLines) { lines = messageLines; - POPMessage m = new POPMessage(); + var m = new POPMessage(); m.Raw = lines; - + foreach(var l in lines) { // Just in case. @@ -39,8 +40,6 @@ public MailParser(List messageLines) // Date MUST be in lowercase else if(lowered.StartsWith("date:")) m.ArrivalTime = GetDate(lowered); - else if(lowered.StartsWith("content-type:")) - m.CharSet = GetEncoding(trimmed); else if(string.IsNullOrWhiteSpace(trimmed)) { int currentLine = lines.IndexOf(l); @@ -48,11 +47,12 @@ public MailParser(List messageLines) break; } } - + + m.Body = new MimeParser(lines).GetBody(); + // Some SMTP sever don't send all the fields. m.Subject = MailParsingUtils.CompleteSubject(m.Subject); - if(m.CharSet == null) - m.CharSet = Encoding.UTF8; + if(m.ID == string.Empty) m.ID = "NO ID"; if(m.Receivers == null) @@ -60,8 +60,7 @@ public MailParser(List messageLines) m.Receivers = new List(); m.Receivers.Add(new Person("ERROR", "ERROR")); } - - m.Body = GetBody(m.CharSet); + m.ContainsHTML = CheckForHTML(); Message = m; @@ -75,7 +74,7 @@ private static string GetID(string s) private Person GetSender(string s) { - Person p = new Person(); + var p = new Person(); // In case there's something interesting on // the following line. int offset = lines.IndexOf(s); // "From:" @@ -149,7 +148,7 @@ private List GetReceivers(string s) index = 0; // Handles multiple receivers. - var nextLine = lines[++offset]; + string nextLine = lines[++offset]; int extraChars = MailDecoder.StartsWith(nextLine); while(extraChars > 0) { @@ -169,7 +168,7 @@ private List GetReceivers(string s) if((index = s.IndexOfAny(delimitor, index)) > -1) { - Person receiver = new Person(); + var receiver = new Person(); receiver.Name = s.SubstringEx('"', '"', index); if(receiver.Name != string.Empty) @@ -373,214 +372,11 @@ private static DateTime GetDate(string s) return dt; } - private Encoding GetEncoding(string s) - { // Encoding.UTF8 is the default encoding. - - string encoding; - int index = s.IndexOf("charset=") + 8; // 8: size of charset= - - if(index == 0) - return Encoding.UTF8; - - encoding = s.SubstringEx('"','"',index); - - // In case there is no quotation marks. - if(encoding == string.Empty) - encoding = s.Substring(index, s.Length - index); - - if(encoding.Contains(";")) - { - index = encoding.IndexOf(';'); - encoding = encoding.Substring(0, index); - } - - if(s.Contains("charset=")) - return Encoding.GetEncoding(encoding); - else - return Encoding.UTF8; - } - - private string GetBody(Encoding charset = null) - { - // "Compile-time constant". - if(charset == null) - charset = Encoding.UTF8; - - var lBody = new List(); - var body = string.Empty; - int bodyStart = Int32.MaxValue; - int lastEmpty; - - bool contentEncoded = false; - if(lines - .IndexOf("Content-Transfer-Encoding: base64") != -1) - contentEncoded = true; - - if(contentEncoded) - { - bodyStart = lines.IndexOf(string.Empty) + 1; - - while(lines[bodyStart].StartsWith("--") || - lines[bodyStart + 1].StartsWith("--")) - { - bodyStart = lines.IndexOf(string.Empty, bodyStart) + 1; - } - lastEmpty = lines.IndexOf(string.Empty, bodyStart); - // lastEmpty counts ONLY if the next line - // is part of the MIME format. - if(!(lastEmpty != -1) && (lines[lastEmpty+1].StartsWith("--"))) - lastEmpty = lines.Count; - - lBody = lines.GetRange(bodyStart, lastEmpty - bodyStart); - string encodedBody = string.Join("", lBody.ToArray()); - string decodedBody = charset.GetString( - Convert.FromBase64String(encodedBody)); - string[] decodedArray = decodedBody.Split( - Environment.NewLine.ToCharArray(), - StringSplitOptions.RemoveEmptyEntries); - - // Replaces the encoded ones with the decoded ones - lines.RemoveRange(bodyStart, lines.Count - bodyStart); - lines.InsertRange(bodyStart, decodedArray); - lBody = new List(decodedArray); - - return string.Join(string.Empty, lBody.ToArray()); - } - - int htmlBegin = -1; - int htmlEnd = -1; - for(int i = 0; i < lines.Count; i++) - { - string current = lines[i]; - - if((i > bodyStart) && current.StartsWith("Content-Type:") - && (charset == Encoding.UTF8)) - { - charset = GetEncoding(current); - } - - if(i > bodyStart) - { - if((htmlBegin == -1) - && (current.StartsWith("") || - current.StartsWithEx("")) - { - htmlEnd = i; - break; - } - - // Sometimes lines end with = sign. - if(current.EndsWith("=")) - current = current.Remove(current.Length - 1 , 1); - current = MailDecoder.DecodeSpecialChars(current, charset); - - // Just in case there is no HTML. - lBody.Add(current); - - lines[i] = current; - // If bodyStart is already set, don't need to check again. - continue; - } - - // Gets the first empty line (end of the headers). - if(string.IsNullOrWhiteSpace(current)) - { - // Skips blank lines - - int offset = i + 1; - while(string.IsNullOrWhiteSpace(lines[offset])) - offset++; - - bodyStart = offset; - } - } - - if((htmlBegin != -1) && (htmlBegin < htmlEnd)) - { - lBody = lines.GetRange(htmlBegin, htmlEnd - htmlBegin); - } - - - // If there is no HTML. - if(lBody.Count == 0) - { - int emptyLine = 0; - foreach(var l in lines) - { - int currentOffset = lines.IndexOf(l); - if(emptyLine == 0) - { - if(string.IsNullOrWhiteSpace(l)) - { - emptyLine = currentOffset + 1; - break; - } - } - - if(currentOffset > emptyLine) - { - string current = lines[currentOffset]; - // Sometimes lines end with = sign. - if(current.EndsWith("=")) - current = current.Remove(current.Length - 1 , 1); - current = MailDecoder.DecodeSpecialChars(current, - Message.CharSet); - } - } - - lBody = lines.GetRange(emptyLine, lines.Count - emptyLine); - } - - - body = string.Join(string.Empty, lBody.ToArray()); - - return body; - } - - private bool CheckForHTML() - { - var kv = CheckForHtml(); - - if((kv.Key != -1) && (kv.Value != -1)) - return true; - else - return false; - } - - private KeyValuePair CheckForHtml() - { - int begin = Int32.MaxValue; - int end = Int32.MaxValue; - - for(int i = 0; i < lines.Count; i++) - { - if((end == Int32.MaxValue) && lines[i].StartsWith("") || - lines[i].StartsWithEx(" end)) - { - begin = -1; - end = -1; - } - - return new KeyValuePair(begin, end); - } - + private bool CheckForHTML() + { + return (from line in lines + where line.StartsWith("Content-Type: text/html;") + select line).Count() > 0; + } } } \ No newline at end of file diff --git a/MimeParser.cs b/MimeParser.cs new file mode 100644 index 0000000..ab61f2d --- /dev/null +++ b/MimeParser.cs @@ -0,0 +1,158 @@ +using System; +using System.IO; +using System.Text; +using System.Linq; +using System.Net.Mime; +using System.Collections.Generic; + + +namespace POP +{ + class MimeParser + { + private readonly List lines; + private readonly string boundary; + + public MimeParser(List messageLines) + { + lines = messageLines; + + boundary = ExtractBoundary(); + } + + public string GetBody() + { + var body = new List(); + foreach (List section in FindSections()) + body.Add(Decode(section)); + + return string.Concat(body); + } + + private List> FindSections() + { + var sections = new List>(); + var indexes = new List(); + + for (int i = 0; i < lines.Count; i++) + { + if (lines[i].StartsWith(boundary)) + indexes.Add(i); + } + + var boundaries = new Dictionary(); + + for (int i = 0; i < indexes.Count - 1; i++) + { + boundaries.Add(indexes[i], indexes[i + 1]); + } + + foreach (var kv in boundaries) + sections.Add(lines.GetRange(kv.Key, kv.Value - kv.Key)); + + // Sometimes there is a plaintext and a html section + + if (sections.Count < 2) + return sections; + + // If the first section is text and the second is HTML then delete the first section because it is + // most likely the same thing but in a different format. + if (sections[0][1].Contains("Content-Type: text/plain;") && + sections[1][1].Contains("Content-Type: text/html;")) + sections.RemoveAt(0); + + return sections; + } + + private string Decode(List section) + { + int encodingIndex = (from line in section + where line.Contains("Content-Transfer-Encoding:") + select section.IndexOf(line)).FirstOrDefault(); + + string contentEncoding = section[encodingIndex].Split(' ')[1]; + var encoding = TransferEncoding.Unknown; + + if (contentEncoding == "7bit") + encoding = TransferEncoding.SevenBit; + //if (contentEncoding == "8bit") Need Framework 4.5 + // encoding = TransferEncoding.EightBit; + if (contentEncoding == "quoted-printable") + encoding = TransferEncoding.QuotedPrintable; + if (contentEncoding.ToLower() == "base64") + encoding = TransferEncoding.Base64; + + int space = (from line in section + where string.IsNullOrWhiteSpace(line) + select section.IndexOf(line)).FirstOrDefault(); + + section.RemoveRange(0, space + 1); + + //section.Remove(section[0]); // boundary + //section.Remove(section[0]); // type + //section.Remove(section[0]); // transfer encoding + + // Remove '=' at the end of each line + for(int i = 0; i < section.Count; i++) + { + string current = section[i]; + + if (current.EndsWith("=")) + section[i] = current.Substring(0, current.Length - 1); + } + + MemoryStream ms; + string lsection = string.Join(string.Empty, section); + + switch (encoding) + { + case TransferEncoding.Base64: + ms = new MemoryStream(Convert.FromBase64String(lsection), false); + break; + + case TransferEncoding.QuotedPrintable: + ms = new MemoryStream(Encoding.UTF8.GetBytes( + string.Join(string.Empty, (from line in section + select POP.MailDecoder.DecodeSpecialChars(line)).ToList()) + )); + break; + + //case TransferEncoding.EightBit: Need framework 4.5 + case TransferEncoding.SevenBit: + case TransferEncoding.Unknown: + default: + ms = new MemoryStream(Encoding.UTF8.GetBytes(lsection), false); + break; + } + + string body; + + using (var sr = new StreamReader(ms)) + body = sr.ReadToEnd(); + ms.Dispose(); + + return body; + } + + private string ExtractBoundary() + { + int boundaryIndex = (from l in lines + where l.StartsWith("Content-Type:") + select lines.IndexOf(l)).FirstOrDefault(); + + if (!lines[boundaryIndex].Contains("boundary=\"")) + boundaryIndex++; + + string boundaryLine = lines[boundaryIndex]; + + if (boundaryLine.StartsWith("\t")) + boundaryLine = boundaryLine.Substring(1); + + boundaryLine = boundaryLine.Substring(boundaryLine.IndexOf('"') + 1); + boundaryLine = boundaryLine.Remove(boundaryLine.Length - 1); + boundaryLine = string.Concat("--", boundaryLine); // MIME logic + + return boundaryLine; + } + } +} diff --git a/POP3Client.cs b/POP3Client.cs index 232ca76..c75ad85 100644 --- a/POP3Client.cs +++ b/POP3Client.cs @@ -160,6 +160,7 @@ private void InternalClose() #endregion #region Internal Send/Receive functions + // These functions will be public until I finish the public API. private void SendCommand(string format, params object[] args) { @@ -458,7 +459,7 @@ public List GetMessages() { var messageList = new List(); - foreach(KeyValuePair kv in ListMessages()) + foreach(KeyValuePair kv in ListMessages()) { POPMessage m = GetMessage(kv.Key); if(m != null) @@ -488,9 +489,7 @@ public POPMessage GetMessage(int messageID) ret = new MailParser(ReceiveMultiLine()).Message; else { - ret = new POPMessage(); - ret.ID = Constants.INVALID; - ret.Body = Protocol.RemoveHeader(response); + ret = new POPMessage { ID = Constants.INVALID, Body = Protocol.RemoveHeader(response) }; } return ret; diff --git a/POPLib.csproj b/POPLib.csproj index 3649333..1490e51 100644 --- a/POPLib.csproj +++ b/POPLib.csproj @@ -48,6 +48,7 @@ + @@ -57,9 +58,6 @@ - - - - + \ No newline at end of file diff --git a/POPMessage.cs b/POPMessage.cs index aa954c1..29e7971 100644 --- a/POPMessage.cs +++ b/POPMessage.cs @@ -11,7 +11,6 @@ public class POPMessage public Person Sender { get; set; } public string Body { get; set; } public string Subject { get; set; } - public Encoding CharSet { get; set; } public DateTime ArrivalTime { get; set; } public bool ContainsHTML { get; set; } public List Header { get; set; }